Pringled committed on
Commit
73b7a75
·
1 Parent(s): 3bd0812

Updated app with code for deduplication

Browse files
Files changed (1) hide show
  1. app.py +16 -14
app.py CHANGED
@@ -6,6 +6,7 @@ from reach import Reach
6
  from difflib import ndiff
7
  import sys
8
  import tqdm
 
9
 
10
  # Load the model at startup
11
  model = StaticModel.from_pretrained("minishlab/M2V_base_output")
@@ -26,13 +27,13 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
26
  """
27
  Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
28
  """
29
- # Build the index
30
  reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
31
 
32
  deduplicated_indices = set(range(len(embedding_matrix)))
33
  duplicate_to_original_mapping = {}
34
 
35
- # Find nearest neighbors
36
  results = reach.nearest_neighbor_threshold(
37
  embedding_matrix,
38
  threshold=threshold,
@@ -40,7 +41,7 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
40
  show_progressbar=True # Allow internal progress bar
41
  )
42
 
43
- # Process duplicates
44
  for i, similar_items in enumerate(results):
45
  if i not in deduplicated_indices:
46
  continue
@@ -58,13 +59,13 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
58
  """
59
  Deduplicate embeddings across two datasets and return the indices of duplicates between them.
60
  """
61
- # Build the index from Dataset 1
62
  reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
63
 
64
  duplicate_indices_in_test = []
65
  duplicate_to_original_mapping = {}
66
 
67
- # Find nearest neighbors between datasets
68
  results = reach.nearest_neighbor_threshold(
69
  embedding_matrix_2,
70
  threshold=threshold,
@@ -72,7 +73,7 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
72
  show_progressbar=True # Allow internal progress bar
73
  )
74
 
75
- # Process duplicates
76
  for i, similar_items in enumerate(results):
77
  similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
78
 
@@ -103,9 +104,9 @@ def perform_deduplication(
103
  super().__init__(*args, **kwargs)
104
 
105
  # Copy module-level attributes from original tqdm module
106
- TqdmWrapper.format_interval = staticmethod(tqdm.format_interval)
107
- TqdmWrapper.format_num = staticmethod(tqdm.format_num)
108
- TqdmWrapper.format_sizeof = staticmethod(tqdm.format_sizeof)
109
 
110
  # Monkey-patch tqdm.tqdm with our wrapper
111
  original_tqdm_tqdm = tqdm.tqdm
@@ -313,12 +314,12 @@ with gr.Blocks() as demo:
313
  compute_button.click(
314
  fn=perform_deduplication,
315
  inputs=[
316
- deduplication_type,
317
- dataset1_name,
318
- dataset1_split,
319
  dataset1_text_column,
320
- dataset2_name,
321
- dataset2_split,
322
  dataset2_text_column,
323
  threshold
324
  ],
@@ -328,6 +329,7 @@ with gr.Blocks() as demo:
328
  demo.launch()
329
 
330
 
 
331
  # import gradio as gr
332
  # from datasets import load_dataset
333
  # import numpy as np
 
6
  from difflib import ndiff
7
  import sys
8
  import tqdm
9
+ from tqdm.utils import format_interval, format_num, format_sizeof
10
 
11
  # Load the model at startup
12
  model = StaticModel.from_pretrained("minishlab/M2V_base_output")
 
27
  """
28
  Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
29
  """
30
+ # Building the index
31
  reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
32
 
33
  deduplicated_indices = set(range(len(embedding_matrix)))
34
  duplicate_to_original_mapping = {}
35
 
36
+ # Finding nearest neighbors
37
  results = reach.nearest_neighbor_threshold(
38
  embedding_matrix,
39
  threshold=threshold,
 
41
  show_progressbar=True # Allow internal progress bar
42
  )
43
 
44
+ # Processing duplicates
45
  for i, similar_items in enumerate(results):
46
  if i not in deduplicated_indices:
47
  continue
 
59
  """
60
  Deduplicate embeddings across two datasets and return the indices of duplicates between them.
61
  """
62
+ # Building the index from Dataset 1
63
  reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
64
 
65
  duplicate_indices_in_test = []
66
  duplicate_to_original_mapping = {}
67
 
68
+ # Finding nearest neighbors between datasets
69
  results = reach.nearest_neighbor_threshold(
70
  embedding_matrix_2,
71
  threshold=threshold,
 
73
  show_progressbar=True # Allow internal progress bar
74
  )
75
 
76
+ # Processing duplicates
77
  for i, similar_items in enumerate(results):
78
  similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
79
 
 
104
  super().__init__(*args, **kwargs)
105
 
106
  # Copy module-level attributes from original tqdm module
107
+ TqdmWrapper.format_interval = staticmethod(format_interval)
108
+ TqdmWrapper.format_num = staticmethod(format_num)
109
+ TqdmWrapper.format_sizeof = staticmethod(format_sizeof)
110
 
111
  # Monkey-patch tqdm.tqdm with our wrapper
112
  original_tqdm_tqdm = tqdm.tqdm
 
314
  compute_button.click(
315
  fn=perform_deduplication,
316
  inputs=[
317
+ deduplication_type,
318
+ dataset1_name,
319
+ dataset1_split,
320
  dataset1_text_column,
321
+ dataset2_name,
322
+ dataset2_split,
323
  dataset2_text_column,
324
  threshold
325
  ],
 
329
  demo.launch()
330
 
331
 
332
+
333
  # import gradio as gr
334
  # from datasets import load_dataset
335
  # import numpy as np