Updated app with code for deduplication
app.py CHANGED

@@ -26,27 +26,24 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
     """
     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
     """
-    #
-    progress.tqdm
-    with progress.tqdm(total=1, desc="Building index") as p:
+    # Building the index with a progress bar
+    with progress.tqdm(total=1, desc="Building search index") as p:
         reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
         p.update(1)

     deduplicated_indices = set(range(len(embedding_matrix)))
     duplicate_to_original_mapping = {}

-    #
-    progress.tqdm.write("Finding nearest neighbors...")
+    # Finding nearest neighbors
     results = reach.nearest_neighbor_threshold(
         embedding_matrix,
         threshold=threshold,
         batch_size=batch_size,
-        show_progressbar=
+        show_progressbar=True  # Allow internal progress bar
     )

-    total_items = len(embedding_matrix)
     # Processing duplicates with a progress bar
-
+    total_items = len(embedding_matrix)
     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
         if i not in deduplicated_indices:
             continue
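
The hunk ends right at the top of the processing loop, so the code that actually fills deduplicated_indices and duplicate_to_original_mapping is not visible here. Below is a minimal, self-contained sketch of the greedy pattern the function appears to implement; the loop body past the continue, the default threshold, and the batch size are illustrative assumptions, not code taken from app.py.

import numpy as np
from reach import Reach

def greedy_deduplicate(embedding_matrix: np.ndarray, threshold: float = 0.9, batch_size: int = 1024):
    # Index every row of the matrix under its integer position, as in the diff above.
    reach = Reach(vectors=embedding_matrix,
                  items=[str(i) for i in range(len(embedding_matrix))])
    # For each row, fetch all rows whose similarity clears the threshold.
    results = reach.nearest_neighbor_threshold(
        embedding_matrix,
        threshold=threshold,
        batch_size=batch_size,
        show_progressbar=False,
    )
    deduplicated_indices = set(range(len(embedding_matrix)))
    duplicate_to_original_mapping = {}
    for i, similar_items in enumerate(results):
        if i not in deduplicated_indices:
            continue  # row i was already claimed as a duplicate of an earlier row
        for item_id, _score in similar_items:
            j = int(item_id)
            if j != i and j in deduplicated_indices:
                deduplicated_indices.remove(j)        # drop the near-duplicate row
                duplicate_to_original_mapping[j] = i  # remember which row it duplicated
    return deduplicated_indices, duplicate_to_original_mapping

In this greedy form the earliest row in index order always survives, and every later near-duplicate is mapped back to it.
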
@@ -64,27 +61,24 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
     """
     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
     """
-    #
-    progress.tqdm
-    with progress.tqdm(total=1, desc="Building index for Dataset 1") as p:
+    # Building the index from Dataset 1
+    with progress.tqdm(total=1, desc="Building search index from Dataset 1") as p:
         reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
         p.update(1)

     duplicate_indices_in_test = []
     duplicate_to_original_mapping = {}

-    #
-    progress.tqdm.write("Finding nearest neighbors between datasets...")
+    # Finding nearest neighbors between datasets
     results = reach.nearest_neighbor_threshold(
         embedding_matrix_2,
         threshold=threshold,
         batch_size=batch_size,
-        show_progressbar=
+        show_progressbar=True  # Allow internal progress bar
     )

     total_items = len(embedding_matrix_2)
     # Processing duplicates with a progress bar
-    progress.tqdm.write("Processing duplicates across datasets...")
     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]

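
In the cross-dataset variant the roles are asymmetric: Dataset 1 is indexed once and Dataset 2 supplies the queries, so only rows of Dataset 2 can be flagged as duplicates. Here is a small self-contained usage sketch of that pattern with the reach library; the random data, dimensions, and the 0.99 threshold are made up for illustration and are not taken from the app.

import numpy as np
from reach import Reach

rng = np.random.default_rng(0)
emb1 = rng.standard_normal((100, 32)).astype(np.float32)                          # "Dataset 1"
emb2 = np.vstack([emb1[:10], rng.standard_normal((50, 32)).astype(np.float32)])   # reuses 10 rows

# Index Dataset 1, query with Dataset 2.
reach = Reach(vectors=emb1, items=[str(i) for i in range(len(emb1))])
results = reach.nearest_neighbor_threshold(
    emb2, threshold=0.99, batch_size=64, show_progressbar=False
)

# A row of Dataset 2 counts as a duplicate if at least one hit clears the threshold.
duplicate_indices_in_test = [
    i for i, hits in enumerate(results) if any(score >= 0.99 for _item, score in hits)
]
print(len(duplicate_indices_in_test), "rows of Dataset 2 duplicate a row of Dataset 1")

Swapping which matrix is indexed and which is queried changes which dataset's rows get flagged.
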
@@ -128,14 +122,11 @@ def perform_deduplication(
     else:
         ds = load_dataset(dataset1_name, split=dataset1_split)

-    # Extract texts
-
-    texts = [example[dataset1_text_column] for example in progress.tqdm(ds, desc="Extracting texts", total=len(ds))]
+    # Extract texts
+    texts = [example[dataset1_text_column] for example in ds]

-    # Compute embeddings
-
-    embedding_matrix = model.encode(texts, show_progressbar=False)  # Disable internal progress bar
-    embedding_matrix = progress.tqdm(embedding_matrix, desc="Computing embeddings", total=len(texts))
+    # Compute embeddings
+    embedding_matrix = model.encode(texts, show_progressbar=True)  # Enable internal progress bar

     # Deduplicate
     result_text = deduplicate_and_prepare_results_single(
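
The two deleted lines in this hunk did less than they looked like: model.encode returns a finished NumPy array, so wrapping that result in progress.tqdm afterwards cannot show encoding progress, and rebinding embedding_matrix to the progress wrapper would hand a tqdm-style iterable rather than the array itself to the deduplication step. Letting the encoder draw its own bar via show_progressbar=True is the simpler route. If the Gradio handler is given a gr.Progress(track_tqdm=True) tracker, which is the usual pattern for code that calls progress.tqdm (an assumption about app.py; the signature is not visible in this diff), that internal bar is mirrored in the web UI as well. A minimal sketch of that wiring, with hypothetical names:

import gradio as gr

# Hypothetical, simplified wiring; run_dedup stands in for the real perform_deduplication,
# whose full signature is not visible in this diff.
def run_dedup(dataset_name: str, progress=gr.Progress(track_tqdm=True)):
    # An explicit bar, as in the diff above.
    for _ in progress.tqdm(range(3), desc="Processing duplicates"):
        pass
    # With track_tqdm=True, tqdm bars opened inside this call (e.g. an encoder's
    # show_progressbar=True bar) are mirrored into the Gradio UI as well.
    return f"Deduplicated {dataset_name}"

with gr.Blocks() as demo:
    name = gr.Textbox(label="Dataset name")
    out = gr.Textbox(label="Result")
    gr.Button("Run").click(run_dedup, inputs=name, outputs=out)

# demo.launch()  # uncomment to serve the sketch locally
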
@@ -158,22 +149,16 @@ def perform_deduplication(
         ds2 = load_dataset(dataset2_name, split=dataset2_split)

     # Extract texts from Dataset 1
-
-    texts1 = [example[dataset1_text_column] for example in progress.tqdm(ds1, desc="Extracting texts from Dataset 1", total=len(ds1))]
+    texts1 = [example[dataset1_text_column] for example in ds1]

     # Extract texts from Dataset 2
-
-    texts2 = [example[dataset2_text_column] for example in progress.tqdm(ds2, desc="Extracting texts from Dataset 2", total=len(ds2))]
+    texts2 = [example[dataset2_text_column] for example in ds2]

     # Compute embeddings for Dataset 1
-
-    embedding_matrix1 = model.encode(texts1, show_progressbar=False)
-    embedding_matrix1 = progress.tqdm(embedding_matrix1, desc="Computing embeddings for Dataset 1", total=len(texts1))
+    embedding_matrix1 = model.encode(texts1, show_progressbar=True)

     # Compute embeddings for Dataset 2
-
-    embedding_matrix2 = model.encode(texts2, show_progressbar=False)
-    embedding_matrix2 = progress.tqdm(embedding_matrix2, desc="Computing embeddings for Dataset 2", total=len(texts2))
+    embedding_matrix2 = model.encode(texts2, show_progressbar=True)

     # Deduplicate across datasets
     result_text = deduplicate_and_prepare_results_cross(
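
This branch encodes both datasets and hands the two matrices to the cross-dataset deduplication step. A natural follow-up, not shown in this diff, is to drop the flagged rows from Dataset 2; with Hugging Face datasets that is a single select call. The dataset name and the duplicate indices below are placeholders for illustration.

from datasets import load_dataset

# Assumed example dataset and made-up duplicate indices, purely for illustration.
ds2 = load_dataset("ag_news", split="test")
duplicate_indices_in_test = [0, 5, 42]   # stand-in for the real cross-deduplication output

dupes = set(duplicate_indices_in_test)
keep = [i for i in range(len(ds2)) if i not in dupes]
ds2_deduplicated = ds2.select(keep)      # Dataset.select keeps only the listed rows
print(f"{len(ds2)} rows -> {len(ds2_deduplicated)} rows after removing cross-dataset duplicates")
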
@@ -322,6 +307,7 @@ with gr.Blocks() as demo:
 demo.launch()


+
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np