Pringled committed on
Commit
4b1ac5a
·
1 Parent(s): 3b4c438

Updated app with code for deduplication

Browse files
Files changed (1) hide show
  1. app.py +18 -32
app.py CHANGED
@@ -26,27 +26,24 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
26
  """
27
  Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
28
  """
29
- # Informative progress bar for building the index
30
- progress.tqdm.write("Building search index...")
31
- with progress.tqdm(total=1, desc="Building index") as p:
32
  reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
33
  p.update(1)
34
 
35
  deduplicated_indices = set(range(len(embedding_matrix)))
36
  duplicate_to_original_mapping = {}
37
 
38
- # Informative progress bar for nearest neighbor search
39
- progress.tqdm.write("Finding nearest neighbors...")
40
  results = reach.nearest_neighbor_threshold(
41
  embedding_matrix,
42
  threshold=threshold,
43
  batch_size=batch_size,
44
- show_progressbar=False # Disable internal progress bar
45
  )
46
 
47
- total_items = len(embedding_matrix)
48
  # Processing duplicates with a progress bar
49
- progress.tqdm.write("Processing duplicates...")
50
  for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
51
  if i not in deduplicated_indices:
52
  continue
@@ -64,27 +61,24 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
64
  """
65
  Deduplicate embeddings across two datasets and return the indices of duplicates between them.
66
  """
67
- # Informative progress bar for building the index
68
- progress.tqdm.write("Building search index from Dataset 1...")
69
- with progress.tqdm(total=1, desc="Building index for Dataset 1") as p:
70
  reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
71
  p.update(1)
72
 
73
  duplicate_indices_in_test = []
74
  duplicate_to_original_mapping = {}
75
 
76
- # Informative progress bar for nearest neighbor search
77
- progress.tqdm.write("Finding nearest neighbors between datasets...")
78
  results = reach.nearest_neighbor_threshold(
79
  embedding_matrix_2,
80
  threshold=threshold,
81
  batch_size=batch_size,
82
- show_progressbar=False # Disable internal progress bar
83
  )
84
 
85
  total_items = len(embedding_matrix_2)
86
  # Processing duplicates with a progress bar
87
- progress.tqdm.write("Processing duplicates across datasets...")
88
  for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
89
  similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
90
 
@@ -128,14 +122,11 @@ def perform_deduplication(
128
  else:
129
  ds = load_dataset(dataset1_name, split=dataset1_split)
130
 
131
- # Extract texts with progress bar
132
- progress.tqdm.write("Extracting texts from Dataset 1...")
133
- texts = [example[dataset1_text_column] for example in progress.tqdm(ds, desc="Extracting texts", total=len(ds))]
134
 
135
- # Compute embeddings with progress bar
136
- progress.tqdm.write("Computing embeddings for Dataset 1...")
137
- embedding_matrix = model.encode(texts, show_progressbar=False) # Disable internal progress bar
138
- embedding_matrix = progress.tqdm(embedding_matrix, desc="Computing embeddings", total=len(texts))
139
 
140
  # Deduplicate
141
  result_text = deduplicate_and_prepare_results_single(
@@ -158,22 +149,16 @@ def perform_deduplication(
158
  ds2 = load_dataset(dataset2_name, split=dataset2_split)
159
 
160
  # Extract texts from Dataset 1
161
- progress.tqdm.write("Extracting texts from Dataset 1...")
162
- texts1 = [example[dataset1_text_column] for example in progress.tqdm(ds1, desc="Extracting texts from Dataset 1", total=len(ds1))]
163
 
164
  # Extract texts from Dataset 2
165
- progress.tqdm.write("Extracting texts from Dataset 2...")
166
- texts2 = [example[dataset2_text_column] for example in progress.tqdm(ds2, desc="Extracting texts from Dataset 2", total=len(ds2))]
167
 
168
  # Compute embeddings for Dataset 1
169
- progress.tqdm.write("Computing embeddings for Dataset 1...")
170
- embedding_matrix1 = model.encode(texts1, show_progressbar=False)
171
- embedding_matrix1 = progress.tqdm(embedding_matrix1, desc="Computing embeddings for Dataset 1", total=len(texts1))
172
 
173
  # Compute embeddings for Dataset 2
174
- progress.tqdm.write("Computing embeddings for Dataset 2...")
175
- embedding_matrix2 = model.encode(texts2, show_progressbar=False)
176
- embedding_matrix2 = progress.tqdm(embedding_matrix2, desc="Computing embeddings for Dataset 2", total=len(texts2))
177
 
178
  # Deduplicate across datasets
179
  result_text = deduplicate_and_prepare_results_cross(
@@ -322,6 +307,7 @@ with gr.Blocks() as demo:
322
  demo.launch()
323
 
324
 
 
325
  # import gradio as gr
326
  # from datasets import load_dataset
327
  # import numpy as np
 
26
  """
27
  Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
28
  """
29
+ # Building the index with a progress bar
30
+ with progress.tqdm(total=1, desc="Building search index") as p:
 
31
  reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
32
  p.update(1)
33
 
34
  deduplicated_indices = set(range(len(embedding_matrix)))
35
  duplicate_to_original_mapping = {}
36
 
37
+ # Finding nearest neighbors
 
38
  results = reach.nearest_neighbor_threshold(
39
  embedding_matrix,
40
  threshold=threshold,
41
  batch_size=batch_size,
42
+ show_progressbar=True # Allow internal progress bar
43
  )
44
 
 
45
  # Processing duplicates with a progress bar
46
+ total_items = len(embedding_matrix)
47
  for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
48
  if i not in deduplicated_indices:
49
  continue
 
61
  """
62
  Deduplicate embeddings across two datasets and return the indices of duplicates between them.
63
  """
64
+ # Building the index from Dataset 1
65
+ with progress.tqdm(total=1, desc="Building search index from Dataset 1") as p:
 
66
  reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
67
  p.update(1)
68
 
69
  duplicate_indices_in_test = []
70
  duplicate_to_original_mapping = {}
71
 
72
+ # Finding nearest neighbors between datasets
 
73
  results = reach.nearest_neighbor_threshold(
74
  embedding_matrix_2,
75
  threshold=threshold,
76
  batch_size=batch_size,
77
+ show_progressbar=True # Allow internal progress bar
78
  )
79
 
80
  total_items = len(embedding_matrix_2)
81
  # Processing duplicates with a progress bar
 
82
  for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
83
  similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
84
 
 
122
  else:
123
  ds = load_dataset(dataset1_name, split=dataset1_split)
124
 
125
+ # Extract texts
126
+ texts = [example[dataset1_text_column] for example in ds]
 
127
 
128
+ # Compute embeddings
129
+ embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
 
 
130
 
131
  # Deduplicate
132
  result_text = deduplicate_and_prepare_results_single(
 
149
  ds2 = load_dataset(dataset2_name, split=dataset2_split)
150
 
151
  # Extract texts from Dataset 1
152
+ texts1 = [example[dataset1_text_column] for example in ds1]
 
153
 
154
  # Extract texts from Dataset 2
155
+ texts2 = [example[dataset2_text_column] for example in ds2]
 
156
 
157
  # Compute embeddings for Dataset 1
158
+ embedding_matrix1 = model.encode(texts1, show_progressbar=True)
 
 
159
 
160
  # Compute embeddings for Dataset 2
161
+ embedding_matrix2 = model.encode(texts2, show_progressbar=True)
 
 
162
 
163
  # Deduplicate across datasets
164
  result_text = deduplicate_and_prepare_results_cross(
 
307
  demo.launch()
308
 
309
 
310
+
311
  # import gradio as gr
312
  # from datasets import load_dataset
313
  # import numpy as np