Pringled committed on
Commit 20f4a6e · 1 Parent(s): 6b0e834

Updated app with code for deduplication

Files changed (1)
  1. app.py +977 -630
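In outline, the updated app.py now embeds each dataset in small batches (so the Gradio progress bar advances per batch) and then drops near-duplicates by querying a Reach index of those embeddings against itself with a similarity threshold. Below is a minimal, self-contained sketch of that flow using the same model and calls as the app; the sample texts and the 0.9 threshold are chosen here purely for illustration.

import numpy as np
from model2vec import StaticModel
from reach import Reach

model = StaticModel.from_pretrained("minishlab/M2V_base_output")
texts = ["the cat sat on the mat", "the cat sat on the mat", "an unrelated sentence"]

# Encode in batches, as the app does, so a UI progress bar can track each step.
batch_size = 2
embedding_matrix = np.concatenate(
    [model.encode(texts[i:i + batch_size], show_progressbar=False)
     for i in range(0, len(texts), batch_size)],
    axis=0,
)

# Query the embeddings against themselves; any other item scoring >= threshold
# is treated as a duplicate of an earlier item.
reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
results = reach.nearest_neighbor_threshold(embedding_matrix, threshold=0.9, batch_size=1024, show_progressbar=False)

keep = set(range(len(embedding_matrix)))
for i, similar_items in enumerate(results):
    if i in keep:
        for idx in (int(item[0]) for item in similar_items if int(item[0]) != i):
            keep.discard(idx)

print(sorted(keep))  # indices of the texts kept after deduplication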
app.py CHANGED
@@ -26,79 +26,6 @@ def batch_iterable(iterable, batch_size):
      for i in range(0, len(iterable), batch_size):
          yield iterable[i:i + batch_size]
 
- def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
-     embeddings = []
-     for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
-         batch_embeddings = model.encode(batch, show_progressbar=False)
-         embeddings.append(batch_embeddings)
-     return np.concatenate(embeddings, axis=0)
-
- def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
-     """
-     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
-     """
-     # Building the index
-     progress(0, desc="Building search index...")
-     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
-
-     deduplicated_indices = set(range(len(embedding_matrix)))
-     duplicate_to_original_mapping = {}
-
-     # Finding nearest neighbors
-     progress(0, desc="Finding nearest neighbors...")
-     results = reach.nearest_neighbor_threshold(
-         embedding_matrix,
-         threshold=threshold,
-         batch_size=batch_size,
-         show_progressbar=False  # Disable internal progress bar
-     )
-
-     # Processing duplicates with a progress bar
-     total_items = len(embedding_matrix)
-     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
-         if i not in deduplicated_indices:
-             continue
-
-         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
-
-         for sim_idx in similar_indices:
-             if sim_idx in deduplicated_indices:
-                 deduplicated_indices.remove(sim_idx)
-                 duplicate_to_original_mapping[sim_idx] = i
-
-     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
-
- def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
-     """
-     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
-     """
-     # Building the index from Dataset 1
-     progress(0, desc="Building search index from Dataset 1...")
-     reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
-
-     duplicate_indices_in_test = []
-     duplicate_to_original_mapping = {}
-
-     # Finding nearest neighbors between datasets
-     progress(0, desc="Finding nearest neighbors between datasets...")
-     results = reach.nearest_neighbor_threshold(
-         embedding_matrix_2,
-         threshold=threshold,
-         batch_size=batch_size,
-         show_progressbar=False  # Disable internal progress bar
-     )
-
-     total_items = len(embedding_matrix_2)
-     # Processing duplicates with a progress bar
-     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
-         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
-
-         if similar_indices:
-             duplicate_indices_in_test.append(i)
-             duplicate_to_original_mapping[i] = similar_indices[0]
-
-     return duplicate_indices_in_test, duplicate_to_original_mapping
-
  def display_word_differences(x: str, y: str) -> str:
      diff = ndiff(x.split(), y.split())
      return " ".join([word for word in diff if word.startswith(('+', '-'))])
@@ -138,7 +65,13 @@ def perform_deduplication(
  # Compute embeddings
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
- embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
 
  # Deduplicate
  status = "Deduplicating embeddings..."
@@ -205,12 +138,23 @@ def perform_deduplication(
  # Compute embeddings for Dataset 1
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
- embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
 
  # Compute embeddings for Dataset 2
  status = "Computing embeddings for Dataset 2..."
  yield status, ""
- embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
 
  # Deduplicate across datasets
  status = "Deduplicating embeddings across datasets..."
@@ -251,6 +195,72 @@ def perform_deduplication(
  yield f"An error occurred: {e}", ""
  raise e
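Both hunks above sit inside perform_deduplication, which Gradio runs as a generator: each `yield status, result_text` streams an intermediate update into the status and result components while the heavy steps keep going. A minimal sketch of that wiring follows; the function name and the toy job body are illustrative stand-ins for the app's real work.

import gradio as gr

def run_job(progress=gr.Progress(track_tqdm=True)):
    # Every yield sends an intermediate (status, result) pair to the two outputs.
    yield "Computing embeddings...", ""
    for _ in progress.tqdm(range(3), desc="Working"):
        pass  # the real app encodes batches and queries Reach here
    yield "Deduplication completed.", "**Number of duplicates found:** 0"

with gr.Blocks() as demo:
    status_output = gr.Markdown()
    result_output = gr.Markdown()
    gr.Button("Compute").click(fn=run_job, inputs=[], outputs=[status_output, result_output])

demo.launch()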
 
  with gr.Blocks() as demo:
      gr.Markdown("# Semantic Deduplication")
 
@@ -317,14 +327,12 @@ demo.launch()
 
 
 
-
  # import gradio as gr
  # from datasets import load_dataset
  # import numpy as np
  # from model2vec import StaticModel
  # from reach import Reach
  # from difflib import ndiff
- # import sys
  # import tqdm
 
  # # Load the model at startup
@@ -342,26 +350,41 @@ demo.launch()
342
  # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
343
  # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
344
 
345
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
 
 
 
 
 
 
 
 
 
 
 
 
346
  # """
347
  # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
348
  # """
349
  # # Building the index
 
350
  # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
351
 
352
  # deduplicated_indices = set(range(len(embedding_matrix)))
353
  # duplicate_to_original_mapping = {}
354
 
355
  # # Finding nearest neighbors
 
356
  # results = reach.nearest_neighbor_threshold(
357
  # embedding_matrix,
358
  # threshold=threshold,
359
  # batch_size=batch_size,
360
- # show_progressbar=True # Allow internal progress bar
361
  # )
362
 
363
- # # Processing duplicates
364
- # for i, similar_items in enumerate(results):
 
365
  # if i not in deduplicated_indices:
366
  # continue
367
 
@@ -374,26 +397,29 @@ demo.launch()
374
 
375
  # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
376
 
377
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
378
  # """
379
  # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
380
  # """
381
  # # Building the index from Dataset 1
 
382
  # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
383
 
384
  # duplicate_indices_in_test = []
385
  # duplicate_to_original_mapping = {}
386
 
387
  # # Finding nearest neighbors between datasets
 
388
  # results = reach.nearest_neighbor_threshold(
389
  # embedding_matrix_2,
390
  # threshold=threshold,
391
  # batch_size=batch_size,
392
- # show_progressbar=True # Allow internal progress bar
393
  # )
394
 
395
- # # Processing duplicates
396
- # for i, similar_items in enumerate(results):
 
397
  # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
398
 
399
  # if similar_indices:
@@ -417,17 +443,10 @@ demo.launch()
417
  # threshold=default_threshold,
418
  # progress=gr.Progress(track_tqdm=True)
419
  # ):
420
- # # Deep Monkey-Patching of tqdm
421
- # original_tqdm = tqdm.tqdm
422
- # tqdm.tqdm = progress.tqdm
423
- # for mod_name in list(sys.modules.keys()):
424
- # if 'tqdm' in mod_name:
425
- # sys.modules[mod_name].tqdm = progress.tqdm
426
-
427
  # try:
428
  # # Convert threshold to float
429
  # threshold = float(threshold)
430
-
431
  # # Initialize status message
432
  # status = ""
433
 
@@ -439,33 +458,33 @@ demo.launch()
439
  # ds = ds_default1
440
  # else:
441
  # ds = load_dataset(dataset1_name, split=dataset1_split)
442
-
443
  # # Extract texts
444
  # status = "Extracting texts from Dataset 1..."
445
  # yield status, ""
446
  # texts = [example[dataset1_text_column] for example in ds]
447
-
448
  # # Compute embeddings
449
  # status = "Computing embeddings for Dataset 1..."
450
  # yield status, ""
451
- # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
452
-
453
  # # Deduplicate
454
  # status = "Deduplicating embeddings..."
455
  # yield status, ""
456
  # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
457
- # embedding_matrix, threshold
458
  # )
459
-
460
  # # Prepare the results
461
  # num_duplicates = len(duplicate_to_original_mapping)
462
  # num_total = len(texts)
463
  # num_deduplicated = len(deduplicated_indices)
464
-
465
  # result_text = f"**Total documents:** {num_total}\n"
466
  # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
467
  # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
468
-
469
  # # Show deduplicated examples
470
  # if num_duplicates > 0:
471
  # result_text += "**Examples of duplicates found:**\n\n"
@@ -480,11 +499,11 @@ demo.launch()
480
  # result_text += "-" * 50 + "\n\n"
481
  # else:
482
  # result_text += "No duplicates found."
483
-
484
  # # Final status
485
  # status = "Deduplication completed."
486
  # yield status, result_text
487
-
488
  # elif deduplication_type == "Cross-dataset":
489
  # # Load Dataset 1
490
  # status = "Loading Dataset 1..."
@@ -493,7 +512,7 @@ demo.launch()
493
  # ds1 = ds_default1
494
  # else:
495
  # ds1 = load_dataset(dataset1_name, split=dataset1_split)
496
-
497
  # # Load Dataset 2
498
  # status = "Loading Dataset 2..."
499
  # yield status, ""
@@ -501,42 +520,42 @@ demo.launch()
501
  # ds2 = ds_default2
502
  # else:
503
  # ds2 = load_dataset(dataset2_name, split=dataset2_split)
504
-
505
  # # Extract texts from Dataset 1
506
  # status = "Extracting texts from Dataset 1..."
507
  # yield status, ""
508
  # texts1 = [example[dataset1_text_column] for example in ds1]
509
-
510
  # # Extract texts from Dataset 2
511
  # status = "Extracting texts from Dataset 2..."
512
  # yield status, ""
513
  # texts2 = [example[dataset2_text_column] for example in ds2]
514
-
515
  # # Compute embeddings for Dataset 1
516
  # status = "Computing embeddings for Dataset 1..."
517
  # yield status, ""
518
- # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
519
-
520
  # # Compute embeddings for Dataset 2
521
  # status = "Computing embeddings for Dataset 2..."
522
  # yield status, ""
523
- # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
524
-
525
  # # Deduplicate across datasets
526
  # status = "Deduplicating embeddings across datasets..."
527
  # yield status, ""
528
  # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
529
- # embedding_matrix1, embedding_matrix2, threshold
530
  # )
531
-
532
  # num_duplicates = len(duplicate_indices_in_ds2)
533
  # num_total_ds2 = len(texts2)
534
  # num_unique_ds2 = num_total_ds2 - num_duplicates
535
-
536
- # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
537
- # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
538
  # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
539
-
540
  # # Show deduplicated examples
541
  # if num_duplicates > 0:
542
  # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
@@ -552,17 +571,14 @@ demo.launch()
552
  # result_text += "-" * 50 + "\n\n"
553
  # else:
554
  # result_text += "No duplicates found."
555
-
556
  # # Final status
557
  # status = "Deduplication completed."
558
  # yield status, result_text
559
 
560
- # finally:
561
- # # Restore original tqdm
562
- # tqdm.tqdm = original_tqdm
563
- # for mod_name in list(sys.modules.keys()):
564
- # if 'tqdm' in mod_name:
565
- # sys.modules[mod_name].tqdm = original_tqdm
566
 
567
  # with gr.Blocks() as demo:
568
  # gr.Markdown("# Semantic Deduplication")
@@ -614,605 +630,670 @@ demo.launch()
614
  # compute_button.click(
615
  # fn=perform_deduplication,
616
  # inputs=[
617
- # deduplication_type,
618
- # dataset1_name,
619
- # dataset1_split,
620
  # dataset1_text_column,
621
- # dataset2_name,
622
- # dataset2_split,
623
  # dataset2_text_column,
624
  # threshold
625
  # ],
626
  # outputs=[status_output, result_output]
627
  # )
628
-
629
  # demo.launch()
630
 
631
 
632
- # import gradio as gr
633
- # from datasets import load_dataset
634
- # import numpy as np
635
- # from model2vec import StaticModel
636
- # from reach import Reach
637
- # from difflib import ndiff
638
- # import sys
639
- # import tqdm
640
 
641
- # # Load the model at startup
642
- # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
643
 
644
- # # Update default dataset to 'sst2' and set default threshold to 0.9
645
- # default_dataset1_name = "sst2"
646
- # default_dataset1_split = "train"
647
- # default_dataset2_name = "sst2"
648
- # default_dataset2_split = "validation"
649
- # default_text_column = "sentence"
650
- # default_threshold = 0.9
651
 
652
- # # Load the default datasets at startup
653
- # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
654
- # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
655
 
656
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
657
- # """
658
- # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
659
- # """
660
- # # Update progress to indicate building the index
661
- # progress(0, desc="Building search index...")
662
- # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
663
 
664
- # deduplicated_indices = set(range(len(embedding_matrix)))
665
- # duplicate_to_original_mapping = {}
666
 
667
- # # Finding nearest neighbors
668
- # progress(0, desc="Finding nearest neighbors...")
669
- # results = reach.nearest_neighbor_threshold(
670
- # embedding_matrix,
671
- # threshold=threshold,
672
- # batch_size=batch_size,
673
- # show_progressbar=True # Allow internal progress bar
674
- # )
675
 
676
- # # Processing duplicates with a progress bar
677
- # total_items = len(embedding_matrix)
678
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
679
- # if i not in deduplicated_indices:
680
- # continue
681
 
682
- # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
683
 
684
- # for sim_idx in similar_indices:
685
- # if sim_idx in deduplicated_indices:
686
- # deduplicated_indices.remove(sim_idx)
687
- # duplicate_to_original_mapping[sim_idx] = i
688
 
689
- # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
690
 
691
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
692
- # """
693
- # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
694
- # """
695
- # # Update progress to indicate building the index
696
- # progress(0, desc="Building search index from Dataset 1...")
697
- # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
698
 
699
- # duplicate_indices_in_test = []
700
- # duplicate_to_original_mapping = {}
701
 
702
- # # Finding nearest neighbors between datasets
703
- # progress(0, desc="Finding nearest neighbors between datasets...")
704
- # results = reach.nearest_neighbor_threshold(
705
- # embedding_matrix_2,
706
- # threshold=threshold,
707
- # batch_size=batch_size,
708
- # show_progressbar=True # Allow internal progress bar
709
- # )
710
 
711
- # total_items = len(embedding_matrix_2)
712
- # # Processing duplicates with a progress bar
713
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
714
- # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
715
 
716
- # if similar_indices:
717
- # duplicate_indices_in_test.append(i)
718
- # duplicate_to_original_mapping[i] = similar_indices[0]
719
 
720
- # return duplicate_indices_in_test, duplicate_to_original_mapping
721
 
722
- # def display_word_differences(x: str, y: str) -> str:
723
- # diff = ndiff(x.split(), y.split())
724
- # return " ".join([word for word in diff if word.startswith(('+', '-'))])
725
 
726
- # def perform_deduplication(
727
- # deduplication_type,
728
- # dataset1_name,
729
- # dataset1_split,
730
- # dataset1_text_column,
731
- # dataset2_name="",
732
- # dataset2_split="",
733
- # dataset2_text_column="",
734
- # threshold=default_threshold,
735
- # progress=gr.Progress(track_tqdm=True)
736
- # ):
737
- # # Monkey-patch tqdm
738
- # original_tqdm = tqdm.tqdm
739
- # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
740
- # tqdm.tqdm = progress.tqdm
741
- # sys.modules['tqdm'].tqdm = progress.tqdm
742
- # sys.modules['tqdm.auto'].tqdm = progress.tqdm
743
- # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
744
 
745
- # try:
746
- # # Convert threshold to float
747
- # threshold = float(threshold)
748
 
749
- # if deduplication_type == "Single dataset":
750
- # # Load Dataset 1
751
- # progress(0, desc="Loading Dataset 1...")
752
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
753
- # ds = ds_default1
754
- # else:
755
- # ds = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
756
 
757
- # # Extract texts
758
- # progress(0, desc="Extracting texts from Dataset 1...")
759
- # texts = [example[dataset1_text_column] for example in ds]
 
760
 
761
- # # Compute embeddings
762
- # progress(0, desc="Computing embeddings for Dataset 1...")
763
- # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
 
764
 
765
- # # Deduplicate
766
- # result_text = deduplicate_and_prepare_results_single(
767
- # embedding_matrix, texts, threshold, progress
768
- # )
 
 
 
 
 
 
 
 
 
 
 
769
 
770
- # return result_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
 
772
- # elif deduplication_type == "Cross-dataset":
773
- # # Load Dataset 1
774
- # progress(0, desc="Loading Dataset 1...")
775
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
776
- # ds1 = ds_default1
777
- # else:
778
- # ds1 = load_dataset(dataset1_name, split=dataset1_split)
 
779
 
780
- # # Load Dataset 2
781
- # progress(0, desc="Loading Dataset 2...")
782
- # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
783
- # ds2 = ds_default2
784
- # else:
785
- # ds2 = load_dataset(dataset2_name, split=dataset2_split)
 
786
 
787
- # # Extract texts from Dataset 1
788
- # progress(0, desc="Extracting texts from Dataset 1...")
789
- # texts1 = [example[dataset1_text_column] for example in ds1]
 
790
 
791
- # # Extract texts from Dataset 2
792
- # progress(0, desc="Extracting texts from Dataset 2...")
793
- # texts2 = [example[dataset2_text_column] for example in ds2]
 
794
 
795
- # # Compute embeddings for Dataset 1
796
- # progress(0, desc="Computing embeddings for Dataset 1...")
797
- # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
 
798
 
799
- # # Compute embeddings for Dataset 2
800
- # progress(0, desc="Computing embeddings for Dataset 2...")
801
- # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
 
802
 
803
- # # Deduplicate across datasets
804
- # result_text = deduplicate_and_prepare_results_cross(
805
- # embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split
806
- # )
 
 
807
 
808
- # return result_text
809
-
810
- # finally:
811
- # # Restore original tqdm
812
- # tqdm.tqdm = original_tqdm
813
- # sys.modules['tqdm'].tqdm = original_tqdm
814
- # sys.modules['tqdm.auto'].tqdm = original_tqdm
815
-
816
- # # Restore reach's original tqdm
817
- # if original_reach_tqdm is not None:
818
- # Reach.tqdm = original_reach_tqdm
819
- # else:
820
- # del Reach.tqdm # If it wasn't originally in Reach's __dict__
821
-
822
- # def deduplicate_and_prepare_results_single(embedding_matrix, texts, threshold, progress):
823
- # # Deduplicate
824
- # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
825
- # embedding_matrix, threshold, progress=progress
826
- # )
827
-
828
- # # Prepare the results
829
- # num_duplicates = len(duplicate_to_original_mapping)
830
- # num_total = len(texts)
831
- # num_deduplicated = len(deduplicated_indices)
832
-
833
- # result_text = f"**Total documents:** {num_total}\n"
834
- # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
835
- # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
836
-
837
- # # Show deduplicated examples
838
- # if num_duplicates > 0:
839
- # result_text += "**Examples of duplicates found:**\n\n"
840
- # num_examples = min(5, num_duplicates)
841
- # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
842
- # original_text = texts[original_idx]
843
- # duplicate_text = texts[duplicate_idx]
844
- # differences = display_word_differences(original_text, duplicate_text)
845
- # result_text += f"**Original text:**\n{original_text}\n\n"
846
- # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
847
- # result_text += f"**Differences:**\n{differences}\n"
848
- # result_text += "-" * 50 + "\n\n"
849
- # else:
850
- # result_text += "No duplicates found."
851
-
852
- # return result_text
853
-
854
- # def deduplicate_and_prepare_results_cross(embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split):
855
- # # Deduplicate across datasets
856
- # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
857
- # embedding_matrix1, embedding_matrix2, threshold, progress=progress
858
- # )
859
-
860
- # num_duplicates = len(duplicate_indices_in_ds2)
861
- # num_total_ds2 = len(texts2)
862
- # num_unique_ds2 = num_total_ds2 - num_duplicates
863
-
864
- # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
865
- # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
866
- # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
867
-
868
- # # Show deduplicated examples
869
- # if num_duplicates > 0:
870
- # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
871
- # num_examples = min(5, num_duplicates)
872
- # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
873
- # original_idx = duplicate_to_original_mapping[duplicate_idx]
874
- # original_text = texts1[original_idx]
875
- # duplicate_text = texts2[duplicate_idx]
876
- # differences = display_word_differences(original_text, duplicate_text)
877
- # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
878
- # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
879
- # result_text += f"**Differences:**\n{differences}\n"
880
- # result_text += "-" * 50 + "\n\n"
881
- # else:
882
- # result_text += "No duplicates found."
883
-
884
- # return result_text
885
-
886
- # with gr.Blocks() as demo:
887
- # gr.Markdown("# Semantic Deduplication")
888
 
889
- # deduplication_type = gr.Radio(
890
- # choices=["Single dataset", "Cross-dataset"],
891
- # label="Deduplication Type",
892
- # value="Single dataset"
893
- # )
 
894
 
895
- # with gr.Row():
896
- # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
897
- # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
898
- # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
899
 
900
- # dataset2_inputs = gr.Column(visible=False)
901
- # with dataset2_inputs:
902
- # gr.Markdown("### Dataset 2")
903
- # with gr.Row():
904
- # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
905
- # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
906
- # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
907
 
908
- # threshold = gr.Slider(
909
- # minimum=0.0,
910
- # maximum=1.0,
911
- # value=default_threshold,
912
- # label="Similarity Threshold"
913
- # )
914
 
915
- # compute_button = gr.Button("Compute")
 
 
 
 
 
 
916
 
917
- # output = gr.Markdown()
 
 
 
 
 
918
 
919
- # # Function to update the visibility of dataset2_inputs
920
- # def update_visibility(deduplication_type_value):
921
- # if deduplication_type_value == "Cross-dataset":
922
- # return gr.update(visible=True)
923
- # else:
924
- # return gr.update(visible=False)
925
 
926
- # deduplication_type.change(
927
- # update_visibility,
928
- # inputs=deduplication_type,
929
- # outputs=dataset2_inputs
930
- # )
931
 
932
- # compute_button.click(
933
- # fn=perform_deduplication,
934
- # inputs=[
935
- # deduplication_type,
936
- # dataset1_name,
937
- # dataset1_split,
938
- # dataset1_text_column,
939
- # dataset2_name,
940
- # dataset2_split,
941
- # dataset2_text_column,
942
- # threshold
943
- # ],
944
- # outputs=output
945
- # )
946
-
947
- # demo.launch()
948
 
 
 
 
 
 
949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950
 
951
 
952
- # import gradio as gr
953
- # from datasets import load_dataset
954
- # import numpy as np
955
- # from model2vec import StaticModel
956
- # from reach import Reach
957
- # from difflib import ndiff
958
- # import sys
959
- # import tqdm
960
 
961
- # # Load the model at startup
962
- # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
963
 
964
- # # Load the default datasets at startup
965
- # default_dataset1_name = "ag_news"
966
- # default_dataset1_split = "train"
967
- # default_dataset2_name = "ag_news"
968
- # default_dataset2_split = "test"
 
 
969
 
970
- # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
971
- # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 
972
 
973
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
974
- # """
975
- # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
976
- # """
977
- # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
 
 
978
 
979
- # deduplicated_indices = set(range(len(embedding_matrix)))
980
- # duplicate_to_original_mapping = {}
981
 
982
- # results = reach.nearest_neighbor_threshold(
983
- # embedding_matrix,
984
- # threshold=threshold,
985
- # batch_size=batch_size,
986
- # show_progressbar=True # Allow internal progress bar
987
- # )
 
 
988
 
989
- # # Process duplicates
990
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
991
- # if i not in deduplicated_indices:
992
- # continue
 
993
 
994
- # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
995
 
996
- # for sim_idx in similar_indices:
997
- # if sim_idx in deduplicated_indices:
998
- # deduplicated_indices.remove(sim_idx)
999
- # duplicate_to_original_mapping[sim_idx] = i
1000
 
1001
- # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1002
 
1003
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1004
- # """
1005
- # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1006
- # """
1007
- # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
 
 
1008
 
1009
- # duplicate_indices_in_test = []
1010
- # duplicate_to_original_mapping = {}
1011
 
1012
- # results = reach.nearest_neighbor_threshold(
1013
- # embedding_matrix_2,
1014
- # threshold=threshold,
1015
- # batch_size=batch_size,
1016
- # show_progressbar=True # Allow internal progress bar
1017
- # )
 
 
1018
 
1019
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
1020
- # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
 
 
1021
 
1022
- # if similar_indices:
1023
- # duplicate_indices_in_test.append(i)
1024
- # duplicate_to_original_mapping[i] = similar_indices[0]
1025
 
1026
- # return duplicate_indices_in_test, duplicate_to_original_mapping
1027
 
1028
- # def display_word_differences(x: str, y: str) -> str:
1029
- # diff = ndiff(x.split(), y.split())
1030
- # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1031
 
1032
- # def perform_deduplication(
1033
- # deduplication_type,
1034
- # dataset1_name,
1035
- # dataset1_split,
1036
- # dataset1_text_column,
1037
- # dataset2_name="",
1038
- # dataset2_split="",
1039
- # dataset2_text_column="",
1040
- # threshold=0.8,
1041
- # progress=gr.Progress(track_tqdm=True)
1042
- # ):
1043
- # # Monkey-patch tqdm
1044
- # original_tqdm = tqdm.tqdm
1045
- # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1046
- # tqdm.tqdm = progress.tqdm
1047
- # sys.modules['tqdm'].tqdm = progress.tqdm
1048
- # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1049
- # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1050
 
1051
- # try:
1052
- # # Convert threshold to float
1053
- # threshold = float(threshold)
1054
 
1055
- # if deduplication_type == "Single dataset":
1056
- # # Check if the dataset is the default one
1057
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1058
- # ds = ds_default1
1059
- # else:
1060
- # ds = load_dataset(dataset1_name, split=dataset1_split)
1061
-
1062
- # # Extract texts
1063
- # texts = [example[dataset1_text_column] for example in ds]
1064
-
1065
- # # Compute embeddings
1066
- # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1067
-
1068
- # # Deduplicate
1069
- # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
1070
 
1071
- # # Prepare the results
1072
- # num_duplicates = len(duplicate_to_original_mapping)
1073
- # num_total = len(texts)
1074
- # num_deduplicated = len(deduplicated_indices)
1075
 
1076
- # result_text = f"**Total documents:** {num_total}\n"
1077
- # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1078
- # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1079
 
1080
- # # Show deduplicated examples
1081
- # result_text += "**Examples of duplicates found:**\n\n"
1082
- # num_examples = min(5, num_duplicates)
1083
- # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1084
- # original_text = texts[original_idx]
1085
- # duplicate_text = texts[duplicate_idx]
1086
- # differences = display_word_differences(original_text, duplicate_text)
1087
- # result_text += f"**Original text:**\n{original_text}\n\n"
1088
- # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1089
- # result_text += f"**Differences:**\n{differences}\n"
1090
- # result_text += "-" * 50 + "\n\n"
1091
 
1092
- # return result_text
1093
 
1094
- # elif deduplication_type == "Cross-dataset":
1095
- # # Dataset 1
1096
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1097
- # ds1 = ds_default1
1098
- # else:
1099
- # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1100
-
1101
- # # Dataset 2
1102
- # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1103
- # ds2 = ds_default2
1104
- # else:
1105
- # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1106
 
1107
- # # Extract texts
1108
- # texts1 = [example[dataset1_text_column] for example in ds1]
1109
- # texts2 = [example[dataset2_text_column] for example in ds2]
 
 
 
1110
 
1111
- # # Compute embeddings
1112
- # embedding_matrix1 = model.encode(texts1, show_progressbar=True) # Enable internal progress bar
1113
- # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1114
 
1115
- # # Deduplicate across datasets
1116
- # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1117
- # embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1118
 
1119
- # num_duplicates = len(duplicate_indices_in_ds2)
1120
- # num_total_ds2 = len(texts2)
1121
- # num_unique_ds2 = num_total_ds2 - num_duplicates
1122
 
1123
- # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1124
- # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1125
- # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1126
 
1127
- # # Show deduplicated examples
1128
- # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1129
- # num_examples = min(5, num_duplicates)
1130
- # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1131
- # original_idx = duplicate_to_original_mapping[duplicate_idx]
1132
- # original_text = texts1[original_idx]
1133
- # duplicate_text = texts2[duplicate_idx]
1134
- # differences = display_word_differences(original_text, duplicate_text)
1135
- # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1136
- # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1137
- # result_text += f"**Differences:**\n{differences}\n"
1138
- # result_text += "-" * 50 + "\n\n"
1139
 
1140
- # return result_text
1141
 
1142
- # finally:
1143
- # # Restore original tqdm
1144
- # tqdm.tqdm = original_tqdm
1145
- # sys.modules['tqdm'].tqdm = original_tqdm
1146
- # sys.modules['tqdm.auto'].tqdm = original_tqdm
1147
 
1148
- # # Restore reach's original tqdm
1149
- # if original_reach_tqdm is not None:
1150
- # Reach.tqdm = original_reach_tqdm
1151
- # else:
1152
- # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1153
 
1154
- # with gr.Blocks() as demo:
1155
- # gr.Markdown("# Semantic Deduplication")
1156
-
1157
- # deduplication_type = gr.Radio(
1158
- # choices=["Single dataset", "Cross-dataset"],
1159
- # label="Deduplication Type",
1160
- # value="Single dataset"
1161
- # )
1162
 
1163
- # with gr.Row():
1164
- # dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
1165
- # dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
1166
- # dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
1167
 
1168
- # dataset2_inputs = gr.Column(visible=False)
1169
- # with dataset2_inputs:
1170
- # gr.Markdown("### Dataset 2")
1171
- # with gr.Row():
1172
- # dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
1173
- # dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
1174
- # dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
1175
 
1176
- # threshold = gr.Slider(
1177
- # minimum=0.0,
1178
- # maximum=1.0,
1179
- # value=0.8,
1180
- # label="Similarity Threshold"
1181
- # )
 
 
 
 
 
 
 
 
1182
 
1183
- # compute_button = gr.Button("Compute")
 
 
 
 
 
 
1184
 
1185
- # output = gr.Markdown()
 
 
1186
 
1187
- # # Function to update the visibility of dataset2_inputs
1188
- # def update_visibility(deduplication_type_value):
1189
- # if deduplication_type_value == "Cross-dataset":
1190
- # return gr.update(visible=True)
1191
- # else:
1192
- # return gr.update(visible=False)
1193
 
1194
- # deduplication_type.change(
1195
- # update_visibility,
1196
- # inputs=deduplication_type,
1197
- # outputs=dataset2_inputs
1198
- # )
 
 
 
 
 
 
 
 
 
 
1199
 
1200
- # compute_button.click(
1201
- # fn=perform_deduplication,
1202
- # inputs=[
1203
- # deduplication_type,
1204
- # dataset1_name,
1205
- # dataset1_split,
1206
- # dataset1_text_column,
1207
- # dataset2_name,
1208
- # dataset2_split,
1209
- # dataset2_text_column,
1210
- # threshold
1211
- # ],
1212
- # outputs=output
1213
- # )
1214
 
1215
- # demo.launch()
 
 
1216
 
1217
 
1218
  # # import gradio as gr
@@ -1253,7 +1334,7 @@ demo.launch()
1253
  # # )
1254
 
1255
  # # # Process duplicates
1256
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
1257
  # # if i not in deduplicated_indices:
1258
  # # continue
1259
 
@@ -1282,8 +1363,7 @@ demo.launch()
1282
  # # show_progressbar=True # Allow internal progress bar
1283
  # # )
1284
 
1285
- # # # Process duplicates
1286
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
1287
  # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1288
 
1289
  # # if similar_indices:
@@ -1309,9 +1389,11 @@ demo.launch()
1309
  # # ):
1310
  # # # Monkey-patch tqdm
1311
  # # original_tqdm = tqdm.tqdm
 
1312
  # # tqdm.tqdm = progress.tqdm
1313
  # # sys.modules['tqdm'].tqdm = progress.tqdm
1314
  # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
 
1315
 
1316
  # # try:
1317
  # # # Convert threshold to float
@@ -1378,7 +1460,8 @@ demo.launch()
1378
  # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1379
 
1380
  # # # Deduplicate across datasets
1381
- # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
 
1382
 
1383
  # # num_duplicates = len(duplicate_indices_in_ds2)
1384
  # # num_total_ds2 = len(texts2)
@@ -1409,6 +1492,12 @@ demo.launch()
1409
  # # sys.modules['tqdm'].tqdm = original_tqdm
1410
  # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1411
 
 
 
 
 
 
 
1412
  # # with gr.Blocks() as demo:
1413
  # # gr.Markdown("# Semantic Deduplication")
1414
 
@@ -1471,3 +1560,261 @@ demo.launch()
1471
  # # )
1472
 
1473
  # # demo.launch()
      for i in range(0, len(iterable), batch_size):
          yield iterable[i:i + batch_size]
 
  def display_word_differences(x: str, y: str) -> str:
      diff = ndiff(x.split(), y.split())
      return " ".join([word for word in diff if word.startswith(('+', '-'))])
 
  # Compute embeddings
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
+ embeddings = []
+ batch_size = 64
+ total_batches = (len(texts) + batch_size - 1) // batch_size
+ for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings", total=total_batches):
+     batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+     embeddings.append(batch_embeddings)
+ embedding_matrix = np.concatenate(embeddings, axis=0)
 
  # Deduplicate
  status = "Deduplicating embeddings..."
 
  # Compute embeddings for Dataset 1
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
+ embeddings1 = []
+ batch_size = 64
+ total_batches1 = (len(texts1) + batch_size - 1) // batch_size
+ for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
+     batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+     embeddings1.append(batch_embeddings)
+ embedding_matrix1 = np.concatenate(embeddings1, axis=0)
 
  # Compute embeddings for Dataset 2
  status = "Computing embeddings for Dataset 2..."
  yield status, ""
+ embeddings2 = []
+ total_batches2 = (len(texts2) + batch_size - 1) // batch_size
+ for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
+     batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+     embeddings2.append(batch_embeddings)
+ embedding_matrix2 = np.concatenate(embeddings2, axis=0)
 
  # Deduplicate across datasets
  status = "Deduplicating embeddings across datasets..."
 
  yield f"An error occurred: {e}", ""
  raise e
 
+ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
+     """
+     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
+     """
+     # Building the index
+     progress(0, desc="Building search index...")
+     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
+
+     deduplicated_indices = set(range(len(embedding_matrix)))
+     duplicate_to_original_mapping = {}
+
+     # Finding nearest neighbors
+     progress(0, desc="Finding nearest neighbors...")
+     results = reach.nearest_neighbor_threshold(
+         embedding_matrix,
+         threshold=threshold,
+         batch_size=batch_size,
+         show_progressbar=False  # Disable internal progress bar
+     )
+
+     # Processing duplicates with a progress bar
+     total_items = len(embedding_matrix)
+     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
+         if i not in deduplicated_indices:
+             continue
+
+         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
+
+         for sim_idx in similar_indices:
+             if sim_idx in deduplicated_indices:
+                 deduplicated_indices.remove(sim_idx)
+                 duplicate_to_original_mapping[sim_idx] = i
+
+     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
+
+ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
+     """
+     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
+     """
+     # Building the index from Dataset 1
+     progress(0, desc="Building search index from Dataset 1...")
+     reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
+
+     duplicate_indices_in_test = []
+     duplicate_to_original_mapping = {}
+
+     # Finding nearest neighbors between datasets
+     progress(0, desc="Finding nearest neighbors between datasets...")
+     results = reach.nearest_neighbor_threshold(
+         embedding_matrix_2,
+         threshold=threshold,
+         batch_size=batch_size,
+         show_progressbar=False  # Disable internal progress bar
+     )
+
+     total_items = len(embedding_matrix_2)
+     # Processing duplicates with a progress bar
+     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
+         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
+
+         if similar_indices:
+             duplicate_indices_in_test.append(i)
+             duplicate_to_original_mapping[i] = similar_indices[0]
+
+     return duplicate_indices_in_test, duplicate_to_original_mapping
+
  with gr.Blocks() as demo:
      gr.Markdown("# Semantic Deduplication")
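The deduplicate helpers added above expect a Gradio-style progress object: something callable as progress(0, desc=...) that also exposes a .tqdm wrapper. Below is a rough sketch of exercising deduplicate outside the UI; the no-op stand-in class and the synthetic embeddings are assumptions made only for this example.

import numpy as np

class NoProgress:
    # Minimal stand-in for gr.Progress: callable, plus a pass-through .tqdm.
    def __call__(self, *args, **kwargs):
        pass
    def tqdm(self, iterable, **kwargs):
        return iterable

rng = np.random.default_rng(0)
embedding_matrix = rng.normal(size=(100, 64)).astype(np.float32)
embedding_matrix[1] = embedding_matrix[0]  # plant one exact duplicate

# deduplicate() is the function defined in app.py above.
kept_indices, dup_map = deduplicate(embedding_matrix, threshold=0.9, progress=NoProgress())
print(len(kept_indices))  # expected: 99
print(dup_map)            # expected: {1: 0}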
 
 
327
 
328
 
329
 
 
330
  # import gradio as gr
331
  # from datasets import load_dataset
332
  # import numpy as np
333
  # from model2vec import StaticModel
334
  # from reach import Reach
335
  # from difflib import ndiff
 
336
  # import tqdm
337
 
338
  # # Load the model at startup
 
350
  # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
351
  # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
352
 
353
+ # def batch_iterable(iterable, batch_size):
354
+ # """Helper function to create batches from an iterable."""
355
+ # for i in range(0, len(iterable), batch_size):
356
+ # yield iterable[i:i + batch_size]
357
+
358
+ # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
359
+ # embeddings = []
360
+ # for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
361
+ # batch_embeddings = model.encode(batch, show_progressbar=False)
362
+ # embeddings.append(batch_embeddings)
363
+ # return np.concatenate(embeddings, axis=0)
364
+
365
+ # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
366
  # """
367
  # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
368
  # """
369
  # # Building the index
370
+ # progress(0, desc="Building search index...")
371
  # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
372
 
373
  # deduplicated_indices = set(range(len(embedding_matrix)))
374
  # duplicate_to_original_mapping = {}
375
 
376
  # # Finding nearest neighbors
377
+ # progress(0, desc="Finding nearest neighbors...")
378
  # results = reach.nearest_neighbor_threshold(
379
  # embedding_matrix,
380
  # threshold=threshold,
381
  # batch_size=batch_size,
382
+ # show_progressbar=False # Disable internal progress bar
383
  # )
384
 
385
+ # # Processing duplicates with a progress bar
386
+ # total_items = len(embedding_matrix)
387
+ # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
388
  # if i not in deduplicated_indices:
389
  # continue
390
 
 
397
 
398
  # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
399
 
400
+ # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
401
  # """
402
  # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
403
  # """
404
  # # Building the index from Dataset 1
405
+ # progress(0, desc="Building search index from Dataset 1...")
406
  # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
407
 
408
  # duplicate_indices_in_test = []
409
  # duplicate_to_original_mapping = {}
410
 
411
  # # Finding nearest neighbors between datasets
412
+ # progress(0, desc="Finding nearest neighbors between datasets...")
413
  # results = reach.nearest_neighbor_threshold(
414
  # embedding_matrix_2,
415
  # threshold=threshold,
416
  # batch_size=batch_size,
417
+ # show_progressbar=False # Disable internal progress bar
418
  # )
419
 
420
+ # total_items = len(embedding_matrix_2)
421
+ # # Processing duplicates with a progress bar
422
+ # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
423
  # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
424
 
425
  # if similar_indices:
 
443
  # threshold=default_threshold,
444
  # progress=gr.Progress(track_tqdm=True)
445
  # ):
 
 
 
 
 
 
 
446
  # try:
447
  # # Convert threshold to float
448
  # threshold = float(threshold)
449
+
450
  # # Initialize status message
451
  # status = ""
452
 
 
458
  # ds = ds_default1
459
  # else:
460
  # ds = load_dataset(dataset1_name, split=dataset1_split)
461
+
462
  # # Extract texts
463
  # status = "Extracting texts from Dataset 1..."
464
  # yield status, ""
465
  # texts = [example[dataset1_text_column] for example in ds]
466
+
467
  # # Compute embeddings
468
  # status = "Computing embeddings for Dataset 1..."
469
  # yield status, ""
470
+ # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
471
+
472
  # # Deduplicate
473
  # status = "Deduplicating embeddings..."
474
  # yield status, ""
475
  # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
476
+ # embedding_matrix, threshold, progress=progress
477
  # )
478
+
479
  # # Prepare the results
480
  # num_duplicates = len(duplicate_to_original_mapping)
481
  # num_total = len(texts)
482
  # num_deduplicated = len(deduplicated_indices)
483
+
484
  # result_text = f"**Total documents:** {num_total}\n"
485
  # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
486
  # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
487
+
488
  # # Show deduplicated examples
489
  # if num_duplicates > 0:
490
  # result_text += "**Examples of duplicates found:**\n\n"
 
499
  # result_text += "-" * 50 + "\n\n"
500
  # else:
501
  # result_text += "No duplicates found."
502
+
503
  # # Final status
504
  # status = "Deduplication completed."
505
  # yield status, result_text
506
+
507
  # elif deduplication_type == "Cross-dataset":
508
  # # Load Dataset 1
509
  # status = "Loading Dataset 1..."
 
512
  # ds1 = ds_default1
513
  # else:
514
  # ds1 = load_dataset(dataset1_name, split=dataset1_split)
515
+
516
  # # Load Dataset 2
517
  # status = "Loading Dataset 2..."
518
  # yield status, ""
 
520
  # ds2 = ds_default2
521
  # else:
522
  # ds2 = load_dataset(dataset2_name, split=dataset2_split)
523
+
524
  # # Extract texts from Dataset 1
525
  # status = "Extracting texts from Dataset 1..."
526
  # yield status, ""
527
  # texts1 = [example[dataset1_text_column] for example in ds1]
528
+
529
  # # Extract texts from Dataset 2
530
  # status = "Extracting texts from Dataset 2..."
531
  # yield status, ""
532
  # texts2 = [example[dataset2_text_column] for example in ds2]
533
+
534
  # # Compute embeddings for Dataset 1
535
  # status = "Computing embeddings for Dataset 1..."
536
  # yield status, ""
537
+ # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
538
+
539
  # # Compute embeddings for Dataset 2
540
  # status = "Computing embeddings for Dataset 2..."
541
  # yield status, ""
542
+ # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
543
+
544
  # # Deduplicate across datasets
545
  # status = "Deduplicating embeddings across datasets..."
546
  # yield status, ""
547
  # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
548
+ # embedding_matrix1, embedding_matrix2, threshold, progress=progress
549
  # )
550
+
551
  # num_duplicates = len(duplicate_indices_in_ds2)
552
  # num_total_ds2 = len(texts2)
553
  # num_unique_ds2 = num_total_ds2 - num_duplicates
554
+
555
+ # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n\n"
556
+ # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n\n"
557
  # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
558
+
559
  # # Show deduplicated examples
560
  # if num_duplicates > 0:
561
  # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
 
571
  # result_text += "-" * 50 + "\n\n"
572
  # else:
573
  # result_text += "No duplicates found."
574
+
575
  # # Final status
576
  # status = "Deduplication completed."
577
  # yield status, result_text
578
 
579
+ # except Exception as e:
580
+ # yield f"An error occurred: {e}", ""
581
+ # raise e
 
 
 
582
 
583
  # with gr.Blocks() as demo:
584
  # gr.Markdown("# Semantic Deduplication")
 
630
  # compute_button.click(
631
  # fn=perform_deduplication,
632
  # inputs=[
633
+ # deduplication_type,
634
+ # dataset1_name,
635
+ # dataset1_split,
636
  # dataset1_text_column,
637
+ # dataset2_name,
638
+ # dataset2_split,
639
  # dataset2_text_column,
640
  # threshold
641
  # ],
642
  # outputs=[status_output, result_output]
643
  # )
644
+
645
  # demo.launch()
667
+
668
+ # # import gradio as gr
669
+ # # from datasets import load_dataset
670
+ # # import numpy as np
671
+ # # from model2vec import StaticModel
672
+ # # from reach import Reach
673
+ # # from difflib import ndiff
674
+ # # import sys
675
+ # # import tqdm
676
+
677
+ # # # Load the model at startup
678
+ # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
679
+
680
+ # # # Update default dataset to 'sst2' and set default threshold to 0.9
681
+ # # default_dataset1_name = "sst2"
682
+ # # default_dataset1_split = "train"
683
+ # # default_dataset2_name = "sst2"
684
+ # # default_dataset2_split = "validation"
685
+ # # default_text_column = "sentence"
686
+ # # default_threshold = 0.9
687
+
688
+ # # # Load the default datasets at startup
689
+ # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
690
+ # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
691
+
692
+ # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
693
+ # # """
694
+ # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
695
+ # # """
696
+ # # # Building the index
697
+ # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
698
+
699
+ # # deduplicated_indices = set(range(len(embedding_matrix)))
700
+ # # duplicate_to_original_mapping = {}
701
+
702
+ # # # Finding nearest neighbors
703
+ # # results = reach.nearest_neighbor_threshold(
704
+ # # embedding_matrix,
705
+ # # threshold=threshold,
706
+ # # batch_size=batch_size,
707
+ # # show_progressbar=True # Allow internal progress bar
708
+ # # )
709
+
710
+ # # # Processing duplicates
711
+ # # for i, similar_items in enumerate(results):
712
+ # # if i not in deduplicated_indices:
713
+ # # continue
714
+
715
+ # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
716
+
717
+ # # for sim_idx in similar_indices:
718
+ # # if sim_idx in deduplicated_indices:
719
+ # # deduplicated_indices.remove(sim_idx)
720
+ # # duplicate_to_original_mapping[sim_idx] = i
721
+
722
+ # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
723
+
724
+ # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
725
+ # # """
726
+ # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
727
+ # # """
728
+ # # # Building the index from Dataset 1
729
+ # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
730
+
731
+ # # duplicate_indices_in_test = []
732
+ # # duplicate_to_original_mapping = {}
733
+
734
+ # # # Finding nearest neighbors between datasets
735
+ # # results = reach.nearest_neighbor_threshold(
736
+ # # embedding_matrix_2,
737
+ # # threshold=threshold,
738
+ # # batch_size=batch_size,
739
+ # # show_progressbar=True # Allow internal progress bar
740
+ # # )
741
+
742
+ # # # Processing duplicates
743
+ # # for i, similar_items in enumerate(results):
744
+ # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
745
+
746
+ # # if similar_indices:
747
+ # # duplicate_indices_in_test.append(i)
748
+ # # duplicate_to_original_mapping[i] = similar_indices[0]
749
+
750
+ # # return duplicate_indices_in_test, duplicate_to_original_mapping
751
+
752
+ # # def display_word_differences(x: str, y: str) -> str:
753
+ # # diff = ndiff(x.split(), y.split())
754
+ # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
755
+
756
+ # # def perform_deduplication(
757
+ # # deduplication_type,
758
+ # # dataset1_name,
759
+ # # dataset1_split,
760
+ # # dataset1_text_column,
761
+ # # dataset2_name="",
762
+ # # dataset2_split="",
763
+ # # dataset2_text_column="",
764
+ # # threshold=default_threshold,
765
+ # # progress=gr.Progress(track_tqdm=True)
766
+ # # ):
767
+ # # # Deep Monkey-Patching of tqdm
768
+ # # original_tqdm = tqdm.tqdm
769
+ # # tqdm.tqdm = progress.tqdm
770
+ # # for mod_name in list(sys.modules.keys()):
771
+ # # if 'tqdm' in mod_name:
772
+ # # sys.modules[mod_name].tqdm = progress.tqdm
773
+
774
+ # # try:
775
+ # # # Convert threshold to float
776
+ # # threshold = float(threshold)
777
 
778
+ # # # Initialize status message
779
+ # # status = ""
780
+
781
+ # # if deduplication_type == "Single dataset":
782
+ # # # Load Dataset 1
783
+ # # status = "Loading Dataset 1..."
784
+ # # yield status, ""
785
+ # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
786
+ # # ds = ds_default1
787
+ # # else:
788
+ # # ds = load_dataset(dataset1_name, split=dataset1_split)
789
 
790
+ # # # Extract texts
791
+ # # status = "Extracting texts from Dataset 1..."
792
+ # # yield status, ""
793
+ # # texts = [example[dataset1_text_column] for example in ds]
794
 
795
+ # # # Compute embeddings
796
+ # # status = "Computing embeddings for Dataset 1..."
797
+ # # yield status, ""
798
+ # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
799
 
800
+ # # # Deduplicate
801
+ # # status = "Deduplicating embeddings..."
802
+ # # yield status, ""
803
+ # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
804
+ # # embedding_matrix, threshold
805
+ # # )
806
+
807
+ # # # Prepare the results
808
+ # # num_duplicates = len(duplicate_to_original_mapping)
809
+ # # num_total = len(texts)
810
+ # # num_deduplicated = len(deduplicated_indices)
811
+
812
+ # # result_text = f"**Total documents:** {num_total}\n"
813
+ # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
814
+ # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
815
 
816
+ # # # Show deduplicated examples
817
+ # # if num_duplicates > 0:
818
+ # # result_text += "**Examples of duplicates found:**\n\n"
819
+ # # num_examples = min(5, num_duplicates)
820
+ # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
821
+ # # original_text = texts[original_idx]
822
+ # # duplicate_text = texts[duplicate_idx]
823
+ # # differences = display_word_differences(original_text, duplicate_text)
824
+ # # result_text += f"**Original text:**\n{original_text}\n\n"
825
+ # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
826
+ # # result_text += f"**Differences:**\n{differences}\n"
827
+ # # result_text += "-" * 50 + "\n\n"
828
+ # # else:
829
+ # # result_text += "No duplicates found."
830
+
831
+ # # # Final status
832
+ # # status = "Deduplication completed."
833
+ # # yield status, result_text
834
 
835
+ # # elif deduplication_type == "Cross-dataset":
836
+ # # # Load Dataset 1
837
+ # # status = "Loading Dataset 1..."
838
+ # # yield status, ""
839
+ # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
840
+ # # ds1 = ds_default1
841
+ # # else:
842
+ # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
843
 
844
+ # # # Load Dataset 2
845
+ # # status = "Loading Dataset 2..."
846
+ # # yield status, ""
847
+ # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
848
+ # # ds2 = ds_default2
849
+ # # else:
850
+ # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
851
 
852
+ # # # Extract texts from Dataset 1
853
+ # # status = "Extracting texts from Dataset 1..."
854
+ # # yield status, ""
855
+ # # texts1 = [example[dataset1_text_column] for example in ds1]
856
 
857
+ # # # Extract texts from Dataset 2
858
+ # # status = "Extracting texts from Dataset 2..."
859
+ # # yield status, ""
860
+ # # texts2 = [example[dataset2_text_column] for example in ds2]
861
 
862
+ # # # Compute embeddings for Dataset 1
863
+ # # status = "Computing embeddings for Dataset 1..."
864
+ # # yield status, ""
865
+ # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
866
 
867
+ # # # Compute embeddings for Dataset 2
868
+ # # status = "Computing embeddings for Dataset 2..."
869
+ # # yield status, ""
870
+ # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
871
 
872
+ # # # Deduplicate across datasets
873
+ # # status = "Deduplicating embeddings across datasets..."
874
+ # # yield status, ""
875
+ # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
876
+ # # embedding_matrix1, embedding_matrix2, threshold
877
+ # # )
878
 
879
+ # # num_duplicates = len(duplicate_indices_in_ds2)
880
+ # # num_total_ds2 = len(texts2)
881
+ # # num_unique_ds2 = num_total_ds2 - num_duplicates
882
+
883
+ # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
884
+ # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
885
+ # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
886
+
887
+ # # # Show deduplicated examples
888
+ # # if num_duplicates > 0:
889
+ # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
890
+ # # num_examples = min(5, num_duplicates)
891
+ # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
892
+ # # original_idx = duplicate_to_original_mapping[duplicate_idx]
893
+ # # original_text = texts1[original_idx]
894
+ # # duplicate_text = texts2[duplicate_idx]
895
+ # # differences = display_word_differences(original_text, duplicate_text)
896
+ # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
897
+ # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
898
+ # # result_text += f"**Differences:**\n{differences}\n"
899
+ # # result_text += "-" * 50 + "\n\n"
900
+ # # else:
901
+ # # result_text += "No duplicates found."
902
+
903
+ # # # Final status
904
+ # # status = "Deduplication completed."
905
+ # # yield status, result_text
 
906
 
907
+ # # finally:
908
+ # # # Restore original tqdm
909
+ # # tqdm.tqdm = original_tqdm
910
+ # # for mod_name in list(sys.modules.keys()):
911
+ # # if 'tqdm' in mod_name:
912
+ # # sys.modules[mod_name].tqdm = original_tqdm
913
 
914
+ # # with gr.Blocks() as demo:
915
+ # # gr.Markdown("# Semantic Deduplication")
 
 
916
 
917
+ # # deduplication_type = gr.Radio(
918
+ # # choices=["Single dataset", "Cross-dataset"],
919
+ # # label="Deduplication Type",
920
+ # # value="Single dataset"
921
+ # # )
 
 
922
 
923
+ # # with gr.Row():
924
+ # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
925
+ # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
926
+ # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
 
 
927
 
928
+ # # dataset2_inputs = gr.Column(visible=False)
929
+ # # with dataset2_inputs:
930
+ # # gr.Markdown("### Dataset 2")
931
+ # # with gr.Row():
932
+ # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
933
+ # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
934
+ # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
935
 
936
+ # # threshold = gr.Slider(
937
+ # # minimum=0.0,
938
+ # # maximum=1.0,
939
+ # # value=default_threshold,
940
+ # # label="Similarity Threshold"
941
+ # # )
942
 
943
+ # # compute_button = gr.Button("Compute")
 
 
 
 
 
944
 
945
+ # # status_output = gr.Markdown()
946
+ # # result_output = gr.Markdown()
 
 
 
947
 
948
+ # # # Function to update the visibility of dataset2_inputs
949
+ # # def update_visibility(deduplication_type_value):
950
+ # # if deduplication_type_value == "Cross-dataset":
951
+ # # return gr.update(visible=True)
952
+ # # else:
953
+ # # return gr.update(visible=False)
 
 
 
 
 
 
 
 
 
 
954
 
955
+ # # deduplication_type.change(
956
+ # # update_visibility,
957
+ # # inputs=deduplication_type,
958
+ # # outputs=dataset2_inputs
959
+ # # )
960
 
961
+ # # compute_button.click(
962
+ # # fn=perform_deduplication,
963
+ # # inputs=[
964
+ # # deduplication_type,
965
+ # # dataset1_name,
966
+ # # dataset1_split,
967
+ # # dataset1_text_column,
968
+ # # dataset2_name,
969
+ # # dataset2_split,
970
+ # # dataset2_text_column,
971
+ # # threshold
972
+ # # ],
973
+ # # outputs=[status_output, result_output]
974
+ # # )
975
+
976
+ # # demo.launch()
977
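The draft above differs from the others mainly in how it reports progress: before running, it swaps the module-level tqdm (and reach's reference to it) for Gradio's progress.tqdm, and restores the originals in a finally block. Factored out of the handler, that pattern amounts to the helper below; the helper name and the work callable are illustrative, not part of the app:

import sys
import tqdm
import gradio as gr

def run_with_gradio_progress(work, progress: gr.Progress):
    # Route tqdm.tqdm (and tqdm.auto.tqdm, if it has been imported) to Gradio's
    # progress tracker for the duration of `work`, then restore the originals.
    original_tqdm = tqdm.tqdm
    auto_module = sys.modules.get("tqdm.auto")
    original_auto = getattr(auto_module, "tqdm", None)
    tqdm.tqdm = progress.tqdm
    if auto_module is not None:
        auto_module.tqdm = progress.tqdm
    try:
        return work()
    finally:
        tqdm.tqdm = original_tqdm
        if auto_module is not None and original_auto is not None:
            auto_module.tqdm = original_auto

Inside an event handler declared with progress=gr.Progress(track_tqdm=True), calling run_with_gradio_progress(lambda: model.encode(texts, show_progressbar=True), progress) would let any tqdm loop inside the call report into the UI.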
 
978
 
979
+ # # import gradio as gr
980
+ # # from datasets import load_dataset
981
+ # # import numpy as np
982
+ # # from model2vec import StaticModel
983
+ # # from reach import Reach
984
+ # # from difflib import ndiff
985
+ # # import sys
986
+ # # import tqdm
987
 
988
+ # # # Load the model at startup
989
+ # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
990
 
991
+ # # # Update default dataset to 'sst2' and set default threshold to 0.9
992
+ # # default_dataset1_name = "sst2"
993
+ # # default_dataset1_split = "train"
994
+ # # default_dataset2_name = "sst2"
995
+ # # default_dataset2_split = "validation"
996
+ # # default_text_column = "sentence"
997
+ # # default_threshold = 0.9
998
 
999
+ # # # Load the default datasets at startup
1000
+ # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1001
+ # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1002
 
1003
+ # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1004
+ # # """
1005
+ # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1006
+ # # """
1007
+ # # # Update progress to indicate building the index
1008
+ # # progress(0, desc="Building search index...")
1009
+ # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1010
 
1011
+ # # deduplicated_indices = set(range(len(embedding_matrix)))
1012
+ # # duplicate_to_original_mapping = {}
1013
 
1014
+ # # # Finding nearest neighbors
1015
+ # # progress(0, desc="Finding nearest neighbors...")
1016
+ # # results = reach.nearest_neighbor_threshold(
1017
+ # # embedding_matrix,
1018
+ # # threshold=threshold,
1019
+ # # batch_size=batch_size,
1020
+ # # show_progressbar=True # Allow internal progress bar
1021
+ # # )
1022
 
1023
+ # # # Processing duplicates with a progress bar
1024
+ # # total_items = len(embedding_matrix)
1025
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
1026
+ # # if i not in deduplicated_indices:
1027
+ # # continue
1028
 
1029
+ # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1030
 
1031
+ # # for sim_idx in similar_indices:
1032
+ # # if sim_idx in deduplicated_indices:
1033
+ # # deduplicated_indices.remove(sim_idx)
1034
+ # # duplicate_to_original_mapping[sim_idx] = i
1035
 
1036
+ # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1037
 
1038
+ # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1039
+ # # """
1040
+ # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1041
+ # # """
1042
+ # # # Update progress to indicate building the index
1043
+ # # progress(0, desc="Building search index from Dataset 1...")
1044
+ # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1045
 
1046
+ # # duplicate_indices_in_test = []
1047
+ # # duplicate_to_original_mapping = {}
1048
 
1049
+ # # # Finding nearest neighbors between datasets
1050
+ # # progress(0, desc="Finding nearest neighbors between datasets...")
1051
+ # # results = reach.nearest_neighbor_threshold(
1052
+ # # embedding_matrix_2,
1053
+ # # threshold=threshold,
1054
+ # # batch_size=batch_size,
1055
+ # # show_progressbar=True # Allow internal progress bar
1056
+ # # )
1057
 
1058
+ # # total_items = len(embedding_matrix_2)
1059
+ # # # Processing duplicates with a progress bar
1060
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
1061
+ # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1062
 
1063
+ # # if similar_indices:
1064
+ # # duplicate_indices_in_test.append(i)
1065
+ # # duplicate_to_original_mapping[i] = similar_indices[0]
1066
 
1067
+ # # return duplicate_indices_in_test, duplicate_to_original_mapping
1068
 
1069
+ # # def display_word_differences(x: str, y: str) -> str:
1070
+ # # diff = ndiff(x.split(), y.split())
1071
+ # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1072
 
1073
+ # # def perform_deduplication(
1074
+ # # deduplication_type,
1075
+ # # dataset1_name,
1076
+ # # dataset1_split,
1077
+ # # dataset1_text_column,
1078
+ # # dataset2_name="",
1079
+ # # dataset2_split="",
1080
+ # # dataset2_text_column="",
1081
+ # # threshold=default_threshold,
1082
+ # # progress=gr.Progress(track_tqdm=True)
1083
+ # # ):
1084
+ # # # Monkey-patch tqdm
1085
+ # # original_tqdm = tqdm.tqdm
1086
+ # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1087
+ # # tqdm.tqdm = progress.tqdm
1088
+ # # sys.modules['tqdm'].tqdm = progress.tqdm
1089
+ # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1090
+ # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1091
 
1092
+ # # try:
1093
+ # # # Convert threshold to float
1094
+ # # threshold = float(threshold)
1095
 
1096
+ # # if deduplication_type == "Single dataset":
1097
+ # # # Load Dataset 1
1098
+ # # progress(0, desc="Loading Dataset 1...")
1099
+ # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1100
+ # # ds = ds_default1
1101
+ # # else:
1102
+ # # ds = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
 
 
 
 
1103
 
1104
+ # # # Extract texts
1105
+ # # progress(0, desc="Extracting texts from Dataset 1...")
1106
+ # # texts = [example[dataset1_text_column] for example in ds]
 
1107
 
1108
+ # # # Compute embeddings
1109
+ # # progress(0, desc="Computing embeddings for Dataset 1...")
1110
+ # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1111
 
1112
+ # # # Deduplicate
1113
+ # # result_text = deduplicate_and_prepare_results_single(
1114
+ # # embedding_matrix, texts, threshold, progress
1115
+ # # )
 
 
 
 
 
 
 
1116
 
1117
+ # # return result_text
1118
 
1119
+ # # elif deduplication_type == "Cross-dataset":
1120
+ # # # Load Dataset 1
1121
+ # # progress(0, desc="Loading Dataset 1...")
1122
+ # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1123
+ # # ds1 = ds_default1
1124
+ # # else:
1125
+ # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
 
1126
 
1127
+ # # # Load Dataset 2
1128
+ # # progress(0, desc="Loading Dataset 2...")
1129
+ # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1130
+ # # ds2 = ds_default2
1131
+ # # else:
1132
+ # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1133
 
1134
+ # # # Extract texts from Dataset 1
1135
+ # # progress(0, desc="Extracting texts from Dataset 1...")
1136
+ # # texts1 = [example[dataset1_text_column] for example in ds1]
1137
 
1138
+ # # # Extract texts from Dataset 2
1139
+ # # progress(0, desc="Extracting texts from Dataset 2...")
1140
+ # # texts2 = [example[dataset2_text_column] for example in ds2]
1141
 
1142
+ # # # Compute embeddings for Dataset 1
1143
+ # # progress(0, desc="Computing embeddings for Dataset 1...")
1144
+ # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
1145
 
1146
+ # # # Compute embeddings for Dataset 2
1147
+ # # progress(0, desc="Computing embeddings for Dataset 2...")
1148
+ # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
1149
 
1150
+ # # # Deduplicate across datasets
1151
+ # # result_text = deduplicate_and_prepare_results_cross(
1152
+ # # embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split
1153
+ # # )
 
 
 
 
 
 
 
 
1154
 
1155
+ # # return result_text
1156
 
1157
+ # # finally:
1158
+ # # # Restore original tqdm
1159
+ # # tqdm.tqdm = original_tqdm
1160
+ # # sys.modules['tqdm'].tqdm = original_tqdm
1161
+ # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1162
 
1163
+ # # # Restore reach's original tqdm
1164
+ # # if original_reach_tqdm is not None:
1165
+ # # Reach.tqdm = original_reach_tqdm
1166
+ # # else:
1167
+ # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1168
 
1169
+ # # def deduplicate_and_prepare_results_single(embedding_matrix, texts, threshold, progress):
1170
+ # # # Deduplicate
1171
+ # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1172
+ # # embedding_matrix, threshold, progress=progress
1173
+ # # )
 
 
 
1174
 
1175
+ # # # Prepare the results
1176
+ # # num_duplicates = len(duplicate_to_original_mapping)
1177
+ # # num_total = len(texts)
1178
+ # # num_deduplicated = len(deduplicated_indices)
1179
 
1180
+ # # result_text = f"**Total documents:** {num_total}\n"
1181
+ # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1182
+ # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
 
 
 
 
1183
 
1184
+ # # # Show deduplicated examples
1185
+ # # if num_duplicates > 0:
1186
+ # # result_text += "**Examples of duplicates found:**\n\n"
1187
+ # # num_examples = min(5, num_duplicates)
1188
+ # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1189
+ # # original_text = texts[original_idx]
1190
+ # # duplicate_text = texts[duplicate_idx]
1191
+ # # differences = display_word_differences(original_text, duplicate_text)
1192
+ # # result_text += f"**Original text:**\n{original_text}\n\n"
1193
+ # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1194
+ # # result_text += f"**Differences:**\n{differences}\n"
1195
+ # # result_text += "-" * 50 + "\n\n"
1196
+ # # else:
1197
+ # # result_text += "No duplicates found."
1198
 
1199
+ # # return result_text
1200
+
1201
+ # # def deduplicate_and_prepare_results_cross(embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split):
1202
+ # # # Deduplicate across datasets
1203
+ # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1204
+ # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
1205
+ # # )
1206
 
1207
+ # # num_duplicates = len(duplicate_indices_in_ds2)
1208
+ # # num_total_ds2 = len(texts2)
1209
+ # # num_unique_ds2 = num_total_ds2 - num_duplicates
1210
 
1211
+ # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1212
+ # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1213
+ # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
 
 
 
1214
 
1215
+ # # # Show deduplicated examples
1216
+ # # if num_duplicates > 0:
1217
+ # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1218
+ # # num_examples = min(5, num_duplicates)
1219
+ # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1220
+ # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1221
+ # # original_text = texts1[original_idx]
1222
+ # # duplicate_text = texts2[duplicate_idx]
1223
+ # # differences = display_word_differences(original_text, duplicate_text)
1224
+ # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1225
+ # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1226
+ # # result_text += f"**Differences:**\n{differences}\n"
1227
+ # # result_text += "-" * 50 + "\n\n"
1228
+ # # else:
1229
+ # # result_text += "No duplicates found."
1230
 
1231
+ # # return result_text
1232
+
1233
+ # # with gr.Blocks() as demo:
1234
+ # # gr.Markdown("# Semantic Deduplication")
1235
+
1236
+ # # deduplication_type = gr.Radio(
1237
+ # # choices=["Single dataset", "Cross-dataset"],
1238
+ # # label="Deduplication Type",
1239
+ # # value="Single dataset"
1240
+ # # )
1241
+
1242
+ # # with gr.Row():
1243
+ # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1244
+ # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1245
+ # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1246
+
1247
+ # # dataset2_inputs = gr.Column(visible=False)
1248
+ # # with dataset2_inputs:
1249
+ # # gr.Markdown("### Dataset 2")
1250
+ # # with gr.Row():
1251
+ # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1252
+ # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1253
+ # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1254
+
1255
+ # # threshold = gr.Slider(
1256
+ # # minimum=0.0,
1257
+ # # maximum=1.0,
1258
+ # # value=default_threshold,
1259
+ # # label="Similarity Threshold"
1260
+ # # )
1261
+
1262
+ # # compute_button = gr.Button("Compute")
1263
+
1264
+ # # output = gr.Markdown()
1265
+
1266
+ # # # Function to update the visibility of dataset2_inputs
1267
+ # # def update_visibility(deduplication_type_value):
1268
+ # # if deduplication_type_value == "Cross-dataset":
1269
+ # # return gr.update(visible=True)
1270
+ # # else:
1271
+ # # return gr.update(visible=False)
1272
+
1273
+ # # deduplication_type.change(
1274
+ # # update_visibility,
1275
+ # # inputs=deduplication_type,
1276
+ # # outputs=dataset2_inputs
1277
+ # # )
1278
+
1279
+ # # compute_button.click(
1280
+ # # fn=perform_deduplication,
1281
+ # # inputs=[
1282
+ # # deduplication_type,
1283
+ # # dataset1_name,
1284
+ # # dataset1_split,
1285
+ # # dataset1_text_column,
1286
+ # # dataset2_name,
1287
+ # # dataset2_split,
1288
+ # # dataset2_text_column,
1289
+ # # threshold
1290
+ # # ],
1291
+ # # outputs=output
1292
+ # # )
1293
 
1294
+ # # demo.launch()
1295
+
1296
+
1297
 
1298
 
1299
  # # import gradio as gr
 
1334
  # # )
1335
 
1336
  # # # Process duplicates
1337
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
1338
  # # if i not in deduplicated_indices:
1339
  # # continue
1340
 
 
1363
  # # show_progressbar=True # Allow internal progress bar
1364
  # # )
1365
 
1366
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
 
1367
  # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1368
 
1369
  # # if similar_indices:
 
1389
  # # ):
1390
  # # # Monkey-patch tqdm
1391
  # # original_tqdm = tqdm.tqdm
1392
+ # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1393
  # # tqdm.tqdm = progress.tqdm
1394
  # # sys.modules['tqdm'].tqdm = progress.tqdm
1395
  # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1396
+ # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1397
 
1398
  # # try:
1399
  # # # Convert threshold to float
 
1460
  # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1461
 
1462
  # # # Deduplicate across datasets
1463
+ # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1464
+ # # embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1465
 
1466
  # # num_duplicates = len(duplicate_indices_in_ds2)
1467
  # # num_total_ds2 = len(texts2)
 
1492
  # # sys.modules['tqdm'].tqdm = original_tqdm
1493
  # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1494
 
1495
+ # # # Restore reach's original tqdm
1496
+ # # if original_reach_tqdm is not None:
1497
+ # # Reach.tqdm = original_reach_tqdm
1498
+ # # else:
1499
+ # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1500
+
1501
  # # with gr.Blocks() as demo:
1502
  # # gr.Markdown("# Semantic Deduplication")
1503
 
 
1560
  # # )
1561
 
1562
  # # demo.launch()
1563
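Both of the drafts above and the oldest one below share the same cross-dataset variant: build the Reach index from Dataset 1, query Dataset 2 against it, and flag every Dataset 2 row whose best match clears the threshold. Reduced to the essentials, with toy lists standing in for the two dataset columns:

from model2vec import StaticModel
from reach import Reach

train_texts = ["the film was wonderful", "the weather is cold today"]
test_texts = ["the film was wonderful", "a completely unrelated sentence"]

model = StaticModel.from_pretrained("minishlab/M2V_base_output")
train_emb = model.encode(train_texts, show_progressbar=False)
test_emb = model.encode(test_texts, show_progressbar=False)

# Index the reference split, then query the other split against it.
reach = Reach(vectors=train_emb, items=[str(i) for i in range(len(train_texts))])
results = reach.nearest_neighbor_threshold(
    test_emb, threshold=0.9, batch_size=64, show_progressbar=False
)

duplicates = {}  # test index -> index of the matching train document
for i, hits in enumerate(results):
    matches = [int(item) for item, score in hits if score >= 0.9]
    if matches:
        duplicates[i] = matches[0]

print(duplicates)  # expected to contain 0 -> 0 for the repeated sentence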
+
1564
+
1565
+ # # # import gradio as gr
1566
+ # # # from datasets import load_dataset
1567
+ # # # import numpy as np
1568
+ # # # from model2vec import StaticModel
1569
+ # # # from reach import Reach
1570
+ # # # from difflib import ndiff
1571
+ # # # import sys
1572
+ # # # import tqdm
1573
+
1574
+ # # # # Load the model at startup
1575
+ # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1576
+
1577
+ # # # # Load the default datasets at startup
1578
+ # # # default_dataset1_name = "ag_news"
1579
+ # # # default_dataset1_split = "train"
1580
+ # # # default_dataset2_name = "ag_news"
1581
+ # # # default_dataset2_split = "test"
1582
+
1583
+ # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1584
+ # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1585
+
1586
+ # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1587
+ # # # """
1588
+ # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1589
+ # # # """
1590
+ # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1591
+
1592
+ # # # deduplicated_indices = set(range(len(embedding_matrix)))
1593
+ # # # duplicate_to_original_mapping = {}
1594
+
1595
+ # # # results = reach.nearest_neighbor_threshold(
1596
+ # # # embedding_matrix,
1597
+ # # # threshold=threshold,
1598
+ # # # batch_size=batch_size,
1599
+ # # # show_progressbar=True # Allow internal progress bar
1600
+ # # # )
1601
+
1602
+ # # # # Process duplicates
1603
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
1604
+ # # # if i not in deduplicated_indices:
1605
+ # # # continue
1606
+
1607
+ # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1608
+
1609
+ # # # for sim_idx in similar_indices:
1610
+ # # # if sim_idx in deduplicated_indices:
1611
+ # # # deduplicated_indices.remove(sim_idx)
1612
+ # # # duplicate_to_original_mapping[sim_idx] = i
1613
+
1614
+ # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1615
+
1616
+ # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1617
+ # # # """
1618
+ # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1619
+ # # # """
1620
+ # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1621
+
1622
+ # # # duplicate_indices_in_test = []
1623
+ # # # duplicate_to_original_mapping = {}
1624
+
1625
+ # # # results = reach.nearest_neighbor_threshold(
1626
+ # # # embedding_matrix_2,
1627
+ # # # threshold=threshold,
1628
+ # # # batch_size=batch_size,
1629
+ # # # show_progressbar=True # Allow internal progress bar
1630
+ # # # )
1631
+
1632
+ # # # # Process duplicates
1633
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
1634
+ # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1635
+
1636
+ # # # if similar_indices:
1637
+ # # # duplicate_indices_in_test.append(i)
1638
+ # # # duplicate_to_original_mapping[i] = similar_indices[0]
1639
+
1640
+ # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1641
+
1642
+ # # # def display_word_differences(x: str, y: str) -> str:
1643
+ # # # diff = ndiff(x.split(), y.split())
1644
+ # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1645
+
1646
+ # # # def perform_deduplication(
1647
+ # # # deduplication_type,
1648
+ # # # dataset1_name,
1649
+ # # # dataset1_split,
1650
+ # # # dataset1_text_column,
1651
+ # # # dataset2_name="",
1652
+ # # # dataset2_split="",
1653
+ # # # dataset2_text_column="",
1654
+ # # # threshold=0.8,
1655
+ # # # progress=gr.Progress(track_tqdm=True)
1656
+ # # # ):
1657
+ # # # # Monkey-patch tqdm
1658
+ # # # original_tqdm = tqdm.tqdm
1659
+ # # # tqdm.tqdm = progress.tqdm
1660
+ # # # sys.modules['tqdm'].tqdm = progress.tqdm
1661
+ # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1662
+
1663
+ # # # try:
1664
+ # # # # Convert threshold to float
1665
+ # # # threshold = float(threshold)
1666
+
1667
+ # # # if deduplication_type == "Single dataset":
1668
+ # # # # Check if the dataset is the default one
1669
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1670
+ # # # ds = ds_default1
1671
+ # # # else:
1672
+ # # # ds = load_dataset(dataset1_name, split=dataset1_split)
1673
+
1674
+ # # # # Extract texts
1675
+ # # # texts = [example[dataset1_text_column] for example in ds]
1676
+
1677
+ # # # # Compute embeddings
1678
+ # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1679
+
1680
+ # # # # Deduplicate
1681
+ # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
1682
+
1683
+ # # # # Prepare the results
1684
+ # # # num_duplicates = len(duplicate_to_original_mapping)
1685
+ # # # num_total = len(texts)
1686
+ # # # num_deduplicated = len(deduplicated_indices)
1687
+
1688
+ # # # result_text = f"**Total documents:** {num_total}\n"
1689
+ # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1690
+ # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1691
+
1692
+ # # # # Show deduplicated examples
1693
+ # # # result_text += "**Examples of duplicates found:**\n\n"
1694
+ # # # num_examples = min(5, num_duplicates)
1695
+ # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1696
+ # # # original_text = texts[original_idx]
1697
+ # # # duplicate_text = texts[duplicate_idx]
1698
+ # # # differences = display_word_differences(original_text, duplicate_text)
1699
+ # # # result_text += f"**Original text:**\n{original_text}\n\n"
1700
+ # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1701
+ # # # result_text += f"**Differences:**\n{differences}\n"
1702
+ # # # result_text += "-" * 50 + "\n\n"
1703
+
1704
+ # # # return result_text
1705
+
1706
+ # # # elif deduplication_type == "Cross-dataset":
1707
+ # # # # Dataset 1
1708
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1709
+ # # # ds1 = ds_default1
1710
+ # # # else:
1711
+ # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1712
+
1713
+ # # # # Dataset 2
1714
+ # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1715
+ # # # ds2 = ds_default2
1716
+ # # # else:
1717
+ # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1718
+
1719
+ # # # # Extract texts
1720
+ # # # texts1 = [example[dataset1_text_column] for example in ds1]
1721
+ # # # texts2 = [example[dataset2_text_column] for example in ds2]
1722
+
1723
+ # # # # Compute embeddings
1724
+ # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True) # Enable internal progress bar
1725
+ # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1726
+
1727
+ # # # # Deduplicate across datasets
1728
+ # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1729
+
1730
+ # # # num_duplicates = len(duplicate_indices_in_ds2)
1731
+ # # # num_total_ds2 = len(texts2)
1732
+ # # # num_unique_ds2 = num_total_ds2 - num_duplicates
1733
+
1734
+ # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1735
+ # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1736
+ # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1737
+
1738
+ # # # # Show deduplicated examples
1739
+ # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1740
+ # # # num_examples = min(5, num_duplicates)
1741
+ # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1742
+ # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1743
+ # # # original_text = texts1[original_idx]
1744
+ # # # duplicate_text = texts2[duplicate_idx]
1745
+ # # # differences = display_word_differences(original_text, duplicate_text)
1746
+ # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1747
+ # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1748
+ # # # result_text += f"**Differences:**\n{differences}\n"
1749
+ # # # result_text += "-" * 50 + "\n\n"
1750
+
1751
+ # # # return result_text
1752
+
1753
+ # # # finally:
1754
+ # # # # Restore original tqdm
1755
+ # # # tqdm.tqdm = original_tqdm
1756
+ # # # sys.modules['tqdm'].tqdm = original_tqdm
1757
+ # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1758
+
1759
+ # # # with gr.Blocks() as demo:
1760
+ # # # gr.Markdown("# Semantic Deduplication")
1761
+
1762
+ # # # deduplication_type = gr.Radio(
1763
+ # # # choices=["Single dataset", "Cross-dataset"],
1764
+ # # # label="Deduplication Type",
1765
+ # # # value="Single dataset"
1766
+ # # # )
1767
+
1768
+ # # # with gr.Row():
1769
+ # # # dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
1770
+ # # # dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
1771
+ # # # dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
1772
+
1773
+ # # # dataset2_inputs = gr.Column(visible=False)
1774
+ # # # with dataset2_inputs:
1775
+ # # # gr.Markdown("### Dataset 2")
1776
+ # # # with gr.Row():
1777
+ # # # dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
1778
+ # # # dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
1779
+ # # # dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
1780
+
1781
+ # # # threshold = gr.Slider(
1782
+ # # # minimum=0.0,
1783
+ # # # maximum=1.0,
1784
+ # # # value=0.8,
1785
+ # # # label="Similarity Threshold"
1786
+ # # # )
1787
+
1788
+ # # # compute_button = gr.Button("Compute")
1789
+
1790
+ # # # output = gr.Markdown()
1791
+
1792
+ # # # # Function to update the visibility of dataset2_inputs
1793
+ # # # def update_visibility(deduplication_type_value):
1794
+ # # # if deduplication_type_value == "Cross-dataset":
1795
+ # # # return gr.update(visible=True)
1796
+ # # # else:
1797
+ # # # return gr.update(visible=False)
1798
+
1799
+ # # # deduplication_type.change(
1800
+ # # # update_visibility,
1801
+ # # # inputs=deduplication_type,
1802
+ # # # outputs=dataset2_inputs
1803
+ # # # )
1804
+
1805
+ # # # compute_button.click(
1806
+ # # # fn=perform_deduplication,
1807
+ # # # inputs=[
1808
+ # # # deduplication_type,
1809
+ # # # dataset1_name,
1810
+ # # # dataset1_split,
1811
+ # # # dataset1_text_column,
1812
+ # # # dataset2_name,
1813
+ # # # dataset2_split,
1814
+ # # # dataset2_text_column,
1815
+ # # # threshold
1816
+ # # # ],
1817
+ # # # outputs=output
1818
+ # # # )
1819
+
1820
+ # # # demo.launch()
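Every generation of the app wires the same Gradio skeleton: a radio button that reveals the Dataset 2 inputs only in cross-dataset mode, and a Compute button whose handler streams status text while it works. A stripped-down, runnable sketch of just that skeleton (the handler body here is a stand-in, not the real deduplication) is:

import gradio as gr

def fake_deduplication(deduplication_type, threshold):
    # Stand-in for perform_deduplication: a generator whose yields stream into
    # the two Markdown outputs below, one (status, result) pair at a time.
    yield "Loading data...", ""
    yield "Computing embeddings...", ""
    yield "Deduplication completed.", f"**Mode:** {deduplication_type}, **threshold:** {threshold}"

with gr.Blocks() as demo:
    gr.Markdown("# Semantic Deduplication")
    deduplication_type = gr.Radio(
        ["Single dataset", "Cross-dataset"], value="Single dataset", label="Deduplication Type"
    )
    dataset2_inputs = gr.Column(visible=False)
    with dataset2_inputs:
        gr.Markdown("### Dataset 2")
    threshold = gr.Slider(0.0, 1.0, value=0.9, label="Similarity Threshold")
    compute_button = gr.Button("Compute")
    status_output = gr.Markdown()
    result_output = gr.Markdown()

    # Show the Dataset 2 column only when cross-dataset mode is selected.
    deduplication_type.change(
        lambda choice: gr.update(visible=(choice == "Cross-dataset")),
        inputs=deduplication_type,
        outputs=dataset2_inputs,
    )
    compute_button.click(
        fn=fake_deduplication,
        inputs=[deduplication_type, threshold],
        outputs=[status_output, result_output],
    )

demo.launch()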