Updated app with code for deduplication
Browse files
app.py
CHANGED
@@ -26,15 +26,15 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
|
|
26 |
"""
|
27 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|
28 |
"""
|
29 |
-
#
|
30 |
-
|
31 |
-
|
32 |
-
p.update(1)
|
33 |
|
34 |
deduplicated_indices = set(range(len(embedding_matrix)))
|
35 |
duplicate_to_original_mapping = {}
|
36 |
|
37 |
# Finding nearest neighbors
|
|
|
38 |
results = reach.nearest_neighbor_threshold(
|
39 |
embedding_matrix,
|
40 |
threshold=threshold,
|
@@ -61,15 +61,15 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
|
|
61 |
"""
|
62 |
Deduplicate embeddings across two datasets and return the indices of duplicates between them.
|
63 |
"""
|
64 |
-
#
|
65 |
-
|
66 |
-
|
67 |
-
p.update(1)
|
68 |
|
69 |
duplicate_indices_in_test = []
|
70 |
duplicate_to_original_mapping = {}
|
71 |
|
72 |
# Finding nearest neighbors between datasets
|
|
|
73 |
results = reach.nearest_neighbor_threshold(
|
74 |
embedding_matrix_2,
|
75 |
threshold=threshold,
|
@@ -117,15 +117,18 @@ def perform_deduplication(
|
|
117 |
|
118 |
if deduplication_type == "Single dataset":
|
119 |
# Load Dataset 1
|
|
|
120 |
if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
|
121 |
ds = ds_default1
|
122 |
else:
|
123 |
ds = load_dataset(dataset1_name, split=dataset1_split)
|
124 |
|
125 |
# Extract texts
|
|
|
126 |
texts = [example[dataset1_text_column] for example in ds]
|
127 |
|
128 |
# Compute embeddings
|
|
|
129 |
embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
|
130 |
|
131 |
# Deduplicate
|
@@ -137,27 +140,33 @@ def perform_deduplication(
|
|
137 |
|
138 |
elif deduplication_type == "Cross-dataset":
|
139 |
# Load Dataset 1
|
|
|
140 |
if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
|
141 |
ds1 = ds_default1
|
142 |
else:
|
143 |
ds1 = load_dataset(dataset1_name, split=dataset1_split)
|
144 |
|
145 |
# Load Dataset 2
|
|
|
146 |
if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
|
147 |
ds2 = ds_default2
|
148 |
else:
|
149 |
ds2 = load_dataset(dataset2_name, split=dataset2_split)
|
150 |
|
151 |
# Extract texts from Dataset 1
|
|
|
152 |
texts1 = [example[dataset1_text_column] for example in ds1]
|
153 |
|
154 |
# Extract texts from Dataset 2
|
|
|
155 |
texts2 = [example[dataset2_text_column] for example in ds2]
|
156 |
|
157 |
# Compute embeddings for Dataset 1
|
|
|
158 |
embedding_matrix1 = model.encode(texts1, show_progressbar=True)
|
159 |
|
160 |
# Compute embeddings for Dataset 2
|
|
|
161 |
embedding_matrix2 = model.encode(texts2, show_progressbar=True)
|
162 |
|
163 |
# Deduplicate across datasets
|
@@ -308,6 +317,7 @@ demo.launch()
|
|
308 |
|
309 |
|
310 |
|
|
|
311 |
# import gradio as gr
|
312 |
# from datasets import load_dataset
|
313 |
# import numpy as np
|
|
|
26 |
"""
|
27 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|
28 |
"""
|
29 |
+
# Update progress to indicate building the index
|
30 |
+
progress(0, desc="Building search index...")
|
31 |
+
reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
|
|
|
32 |
|
33 |
deduplicated_indices = set(range(len(embedding_matrix)))
|
34 |
duplicate_to_original_mapping = {}
|
35 |
|
36 |
# Finding nearest neighbors
|
37 |
+
progress(0, desc="Finding nearest neighbors...")
|
38 |
results = reach.nearest_neighbor_threshold(
|
39 |
embedding_matrix,
|
40 |
threshold=threshold,
|
|
|
61 |
"""
|
62 |
Deduplicate embeddings across two datasets and return the indices of duplicates between them.
|
63 |
"""
|
64 |
+
# Update progress to indicate building the index
|
65 |
+
progress(0, desc="Building search index from Dataset 1...")
|
66 |
+
reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
|
|
|
67 |
|
68 |
duplicate_indices_in_test = []
|
69 |
duplicate_to_original_mapping = {}
|
70 |
|
71 |
# Finding nearest neighbors between datasets
|
72 |
+
progress(0, desc="Finding nearest neighbors between datasets...")
|
73 |
results = reach.nearest_neighbor_threshold(
|
74 |
embedding_matrix_2,
|
75 |
threshold=threshold,
|
|
|
117 |
|
118 |
if deduplication_type == "Single dataset":
|
119 |
# Load Dataset 1
|
120 |
+
progress(0, desc="Loading Dataset 1...")
|
121 |
if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
|
122 |
ds = ds_default1
|
123 |
else:
|
124 |
ds = load_dataset(dataset1_name, split=dataset1_split)
|
125 |
|
126 |
# Extract texts
|
127 |
+
progress(0, desc="Extracting texts from Dataset 1...")
|
128 |
texts = [example[dataset1_text_column] for example in ds]
|
129 |
|
130 |
# Compute embeddings
|
131 |
+
progress(0, desc="Computing embeddings for Dataset 1...")
|
132 |
embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
|
133 |
|
134 |
# Deduplicate
|
|
|
140 |
|
141 |
elif deduplication_type == "Cross-dataset":
|
142 |
# Load Dataset 1
|
143 |
+
progress(0, desc="Loading Dataset 1...")
|
144 |
if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
|
145 |
ds1 = ds_default1
|
146 |
else:
|
147 |
ds1 = load_dataset(dataset1_name, split=dataset1_split)
|
148 |
|
149 |
# Load Dataset 2
|
150 |
+
progress(0, desc="Loading Dataset 2...")
|
151 |
if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
|
152 |
ds2 = ds_default2
|
153 |
else:
|
154 |
ds2 = load_dataset(dataset2_name, split=dataset2_split)
|
155 |
|
156 |
# Extract texts from Dataset 1
|
157 |
+
progress(0, desc="Extracting texts from Dataset 1...")
|
158 |
texts1 = [example[dataset1_text_column] for example in ds1]
|
159 |
|
160 |
# Extract texts from Dataset 2
|
161 |
+
progress(0, desc="Extracting texts from Dataset 2...")
|
162 |
texts2 = [example[dataset2_text_column] for example in ds2]
|
163 |
|
164 |
# Compute embeddings for Dataset 1
|
165 |
+
progress(0, desc="Computing embeddings for Dataset 1...")
|
166 |
embedding_matrix1 = model.encode(texts1, show_progressbar=True)
|
167 |
|
168 |
# Compute embeddings for Dataset 2
|
169 |
+
progress(0, desc="Computing embeddings for Dataset 2...")
|
170 |
embedding_matrix2 = model.encode(texts2, show_progressbar=True)
|
171 |
|
172 |
# Deduplicate across datasets
|
|
|
317 |
|
318 |
|
319 |
|
320 |
+
|
321 |
# import gradio as gr
|
322 |
# from datasets import load_dataset
|
323 |
# import numpy as np
|