Pringled committed on
Commit
504b6fc
·
1 Parent(s): 4b1ac5a

Updated app with code for deduplication

Browse files
Files changed (1) hide show
  1. app.py +18 -8
app.py CHANGED
@@ -26,15 +26,15 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
26
  """
27
  Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
28
  """
29
- # Building the index with a progress bar
30
- with progress.tqdm(total=1, desc="Building search index") as p:
31
- reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
32
- p.update(1)
33
 
34
  deduplicated_indices = set(range(len(embedding_matrix)))
35
  duplicate_to_original_mapping = {}
36
 
37
  # Finding nearest neighbors
 
38
  results = reach.nearest_neighbor_threshold(
39
  embedding_matrix,
40
  threshold=threshold,
@@ -61,15 +61,15 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
61
  """
62
  Deduplicate embeddings across two datasets and return the indices of duplicates between them.
63
  """
64
- # Building the index from Dataset 1
65
- with progress.tqdm(total=1, desc="Building search index from Dataset 1") as p:
66
- reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
67
- p.update(1)
68
 
69
  duplicate_indices_in_test = []
70
  duplicate_to_original_mapping = {}
71
 
72
  # Finding nearest neighbors between datasets
 
73
  results = reach.nearest_neighbor_threshold(
74
  embedding_matrix_2,
75
  threshold=threshold,
@@ -117,15 +117,18 @@ def perform_deduplication(
117
 
118
  if deduplication_type == "Single dataset":
119
  # Load Dataset 1
 
120
  if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
121
  ds = ds_default1
122
  else:
123
  ds = load_dataset(dataset1_name, split=dataset1_split)
124
 
125
  # Extract texts
 
126
  texts = [example[dataset1_text_column] for example in ds]
127
 
128
  # Compute embeddings
 
129
  embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
130
 
131
  # Deduplicate
@@ -137,27 +140,33 @@ def perform_deduplication(
137
 
138
  elif deduplication_type == "Cross-dataset":
139
  # Load Dataset 1
 
140
  if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
141
  ds1 = ds_default1
142
  else:
143
  ds1 = load_dataset(dataset1_name, split=dataset1_split)
144
 
145
  # Load Dataset 2
 
146
  if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
147
  ds2 = ds_default2
148
  else:
149
  ds2 = load_dataset(dataset2_name, split=dataset2_split)
150
 
151
  # Extract texts from Dataset 1
 
152
  texts1 = [example[dataset1_text_column] for example in ds1]
153
 
154
  # Extract texts from Dataset 2
 
155
  texts2 = [example[dataset2_text_column] for example in ds2]
156
 
157
  # Compute embeddings for Dataset 1
 
158
  embedding_matrix1 = model.encode(texts1, show_progressbar=True)
159
 
160
  # Compute embeddings for Dataset 2
 
161
  embedding_matrix2 = model.encode(texts2, show_progressbar=True)
162
 
163
  # Deduplicate across datasets
@@ -308,6 +317,7 @@ demo.launch()
308
 
309
 
310
 
 
311
  # import gradio as gr
312
  # from datasets import load_dataset
313
  # import numpy as np
 
26
  """
27
  Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
28
  """
29
+ # Update progress to indicate building the index
30
+ progress(0, desc="Building search index...")
31
+ reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
 
32
 
33
  deduplicated_indices = set(range(len(embedding_matrix)))
34
  duplicate_to_original_mapping = {}
35
 
36
  # Finding nearest neighbors
37
+ progress(0, desc="Finding nearest neighbors...")
38
  results = reach.nearest_neighbor_threshold(
39
  embedding_matrix,
40
  threshold=threshold,
 
61
  """
62
  Deduplicate embeddings across two datasets and return the indices of duplicates between them.
63
  """
64
+ # Update progress to indicate building the index
65
+ progress(0, desc="Building search index from Dataset 1...")
66
+ reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
 
67
 
68
  duplicate_indices_in_test = []
69
  duplicate_to_original_mapping = {}
70
 
71
  # Finding nearest neighbors between datasets
72
+ progress(0, desc="Finding nearest neighbors between datasets...")
73
  results = reach.nearest_neighbor_threshold(
74
  embedding_matrix_2,
75
  threshold=threshold,
 
117
 
118
  if deduplication_type == "Single dataset":
119
  # Load Dataset 1
120
+ progress(0, desc="Loading Dataset 1...")
121
  if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
122
  ds = ds_default1
123
  else:
124
  ds = load_dataset(dataset1_name, split=dataset1_split)
125
 
126
  # Extract texts
127
+ progress(0, desc="Extracting texts from Dataset 1...")
128
  texts = [example[dataset1_text_column] for example in ds]
129
 
130
  # Compute embeddings
131
+ progress(0, desc="Computing embeddings for Dataset 1...")
132
  embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
133
 
134
  # Deduplicate
 
140
 
141
  elif deduplication_type == "Cross-dataset":
142
  # Load Dataset 1
143
+ progress(0, desc="Loading Dataset 1...")
144
  if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
145
  ds1 = ds_default1
146
  else:
147
  ds1 = load_dataset(dataset1_name, split=dataset1_split)
148
 
149
  # Load Dataset 2
150
+ progress(0, desc="Loading Dataset 2...")
151
  if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
152
  ds2 = ds_default2
153
  else:
154
  ds2 = load_dataset(dataset2_name, split=dataset2_split)
155
 
156
  # Extract texts from Dataset 1
157
+ progress(0, desc="Extracting texts from Dataset 1...")
158
  texts1 = [example[dataset1_text_column] for example in ds1]
159
 
160
  # Extract texts from Dataset 2
161
+ progress(0, desc="Extracting texts from Dataset 2...")
162
  texts2 = [example[dataset2_text_column] for example in ds2]
163
 
164
  # Compute embeddings for Dataset 1
165
+ progress(0, desc="Computing embeddings for Dataset 1...")
166
  embedding_matrix1 = model.encode(texts1, show_progressbar=True)
167
 
168
  # Compute embeddings for Dataset 2
169
+ progress(0, desc="Computing embeddings for Dataset 2...")
170
  embedding_matrix2 = model.encode(texts2, show_progressbar=True)
171
 
172
  # Deduplicate across datasets
 
317
 
318
 
319
 
320
+
321
  # import gradio as gr
322
  # from datasets import load_dataset
323
  # import numpy as np