Updates
Browse files
app.py
CHANGED
@@ -21,7 +21,16 @@ def deduplicate_embeddings(
|
|
21 |
batch_size: int = 1024,
|
22 |
progress=None
|
23 |
) -> tuple[np.ndarray, dict[int, int]]:
|
24 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
if embeddings_b is None:
|
26 |
reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
|
27 |
duplicate_to_original = {}
|
@@ -49,13 +58,27 @@ def deduplicate_embeddings(
|
|
49 |
return duplicate_indices_in_b, duplicate_to_original
|
50 |
|
51 |
def display_word_differences(x: str, y: str) -> str:
|
52 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
diff = ndiff(x.split(), y.split())
|
54 |
formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
|
55 |
return f"```\n{formatted_diff}\n```"
|
56 |
|
57 |
def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
|
58 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
ds = load_dataset(dataset_name, split=dataset_split)
|
60 |
return [example[text_column] for example in ds]
|
61 |
|
@@ -70,7 +93,20 @@ def perform_deduplication(
|
|
70 |
threshold: float = default_threshold,
|
71 |
progress: gr.Progress = gr.Progress(track_tqdm=True)
|
72 |
):
|
73 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
try:
|
75 |
threshold = float(threshold)
|
76 |
|
@@ -209,6 +245,8 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
|
209 |
|
210 |
demo.launch()
|
211 |
|
|
|
|
|
212 |
# import gradio as gr
|
213 |
# from datasets import load_dataset
|
214 |
# import numpy as np
|
@@ -232,16 +270,7 @@ demo.launch()
|
|
232 |
# batch_size: int = 1024,
|
233 |
# progress=None
|
234 |
# ) -> tuple[np.ndarray, dict[int, int]]:
|
235 |
-
# """
|
236 |
-
# Deduplicate embeddings within one dataset or across two datasets.
|
237 |
-
|
238 |
-
# :param embeddings_a: Embeddings of Dataset 1.
|
239 |
-
# :param embeddings_b: Optional, embeddings of Dataset 2.
|
240 |
-
# :param threshold: Similarity threshold for deduplication.
|
241 |
-
# :param batch_size: Batch size for similarity computation.
|
242 |
-
# :param progress: Gradio progress tracker for feedback.
|
243 |
-
# :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
|
244 |
-
# """
|
245 |
# if embeddings_b is None:
|
246 |
# reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
|
247 |
# duplicate_to_original = {}
|
@@ -269,39 +298,13 @@ demo.launch()
|
|
269 |
# return duplicate_indices_in_b, duplicate_to_original
|
270 |
|
271 |
# def display_word_differences(x: str, y: str) -> str:
|
272 |
-
# """
|
273 |
-
# Display the word-level differences between two texts, formatted to avoid
|
274 |
-
# misinterpretation of Markdown syntax.
|
275 |
-
|
276 |
-
# :param x: First text.
|
277 |
-
# :param y: Second text.
|
278 |
-
# :return: A string showing word-level differences, wrapped in a code block.
|
279 |
-
# """
|
280 |
# diff = ndiff(x.split(), y.split())
|
281 |
-
# # Wrap differences in a code block to prevent interpretation as Markdown
|
282 |
# formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
|
283 |
# return f"```\n{formatted_diff}\n```"
|
284 |
|
285 |
-
# # def display_word_differences(x: str, y: str) -> str:
|
286 |
-
# # """
|
287 |
-
# # Display the word-level differences between two texts.
|
288 |
-
|
289 |
-
# # :param x: First text.
|
290 |
-
# # :param y: Second text.
|
291 |
-
# # :return: A string showing word-level differences.
|
292 |
-
# # """
|
293 |
-
# # diff = ndiff(x.split(), y.split())
|
294 |
-
# # return " ".join(word for word in diff if word.startswith(("+", "-")))
|
295 |
-
|
296 |
# def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
|
297 |
-
# """
|
298 |
-
# Load texts from a specified dataset and split.
|
299 |
-
|
300 |
-
# :param dataset_name: Name of the dataset.
|
301 |
-
# :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
|
302 |
-
# :param text_column: Name of the text column.
|
303 |
-
# :return: A list of texts from the dataset.
|
304 |
-
# """
|
305 |
# ds = load_dataset(dataset_name, split=dataset_split)
|
306 |
# return [example[text_column] for example in ds]
|
307 |
|
@@ -316,20 +319,7 @@ demo.launch()
|
|
316 |
# threshold: float = default_threshold,
|
317 |
# progress: gr.Progress = gr.Progress(track_tqdm=True)
|
318 |
# ):
|
319 |
-
# """
|
320 |
-
# Perform deduplication on one or two datasets based on the deduplication type.
|
321 |
-
|
322 |
-
# :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
|
323 |
-
# :param dataset1_name: Name of the first dataset.
|
324 |
-
# :param dataset1_split: Split of the first dataset.
|
325 |
-
# :param dataset1_text_column: Text column of the first dataset.
|
326 |
-
# :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
|
327 |
-
# :param dataset2_split: Optional, split of the second dataset.
|
328 |
-
# :param dataset2_text_column: Optional, text column of the second dataset.
|
329 |
-
# :param threshold: Similarity threshold for deduplication.
|
330 |
-
# :param progress: Gradio progress tracker.
|
331 |
-
# :return: Status updates and result text for the Gradio interface.
|
332 |
-
# """
|
333 |
# try:
|
334 |
# threshold = float(threshold)
|
335 |
|
@@ -411,6 +401,7 @@ demo.launch()
|
|
411 |
# yield f"An error occurred: {e}", ""
|
412 |
# raise e
|
413 |
|
|
|
414 |
# with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
415 |
# gr.Markdown("# Semantic Deduplication")
|
416 |
# gr.Markdown("""
|
@@ -440,7 +431,7 @@ demo.launch()
|
|
440 |
# dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
|
441 |
|
442 |
# threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
|
443 |
-
# compute_button = gr.Button("
|
444 |
# status_output = gr.Markdown(elem_id="status_output")
|
445 |
# result_output = gr.Markdown()
|
446 |
|
@@ -464,5 +455,5 @@ demo.launch()
|
|
464 |
# outputs=[status_output, result_output],
|
465 |
# )
|
466 |
|
467 |
-
# demo.launch()
|
468 |
|
|
|
|
21 |
batch_size: int = 1024,
|
22 |
progress=None
|
23 |
) -> tuple[np.ndarray, dict[int, int]]:
|
24 |
+
"""
|
25 |
+
Deduplicate embeddings within one dataset or across two datasets.
|
26 |
+
|
27 |
+
:param embeddings_a: Embeddings of Dataset 1.
|
28 |
+
:param embeddings_b: Optional, embeddings of Dataset 2.
|
29 |
+
:param threshold: Similarity threshold for deduplication.
|
30 |
+
:param batch_size: Batch size for similarity computation.
|
31 |
+
:param progress: Gradio progress tracker for feedback.
|
32 |
+
:return: Deduplicated indices and a mapping of removed indices to their original counterparts.
|
33 |
+
"""
|
34 |
if embeddings_b is None:
|
35 |
reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
|
36 |
duplicate_to_original = {}
|
|
|
58 |
return duplicate_indices_in_b, duplicate_to_original
|
59 |
|
60 |
def display_word_differences(x: str, y: str) -> str:
|
61 |
+
"""
|
62 |
+
Display the word-level differences between two texts, formatted to avoid
|
63 |
+
misinterpretation of Markdown syntax.
|
64 |
+
|
65 |
+
:param x: First text.
|
66 |
+
:param y: Second text.
|
67 |
+
:return: A string showing word-level differences, wrapped in a code block.
|
68 |
+
"""
|
69 |
diff = ndiff(x.split(), y.split())
|
70 |
formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
|
71 |
return f"```\n{formatted_diff}\n```"
|
72 |
|
73 |
def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
|
74 |
+
"""
|
75 |
+
Load texts from a specified dataset and split.
|
76 |
+
|
77 |
+
:param dataset_name: Name of the dataset.
|
78 |
+
:param dataset_split: Split of the dataset (e.g., 'train', 'validation').
|
79 |
+
:param text_column: Name of the text column.
|
80 |
+
:return: A list of texts from the dataset.
|
81 |
+
"""
|
82 |
ds = load_dataset(dataset_name, split=dataset_split)
|
83 |
return [example[text_column] for example in ds]
|
84 |
|
|
|
93 |
threshold: float = default_threshold,
|
94 |
progress: gr.Progress = gr.Progress(track_tqdm=True)
|
95 |
):
|
96 |
+
"""
|
97 |
+
Perform deduplication on one or two datasets based on the deduplication type.
|
98 |
+
|
99 |
+
:param deduplication_type: 'Single dataset' or 'Cross-dataset'.
|
100 |
+
:param dataset1_name: Name of the first dataset.
|
101 |
+
:param dataset1_split: Split of the first dataset.
|
102 |
+
:param dataset1_text_column: Text column of the first dataset.
|
103 |
+
:param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
|
104 |
+
:param dataset2_split: Optional, split of the second dataset.
|
105 |
+
:param dataset2_text_column: Optional, text column of the second dataset.
|
106 |
+
:param threshold: Similarity threshold for deduplication.
|
107 |
+
:param progress: Gradio progress tracker.
|
108 |
+
:return: Status updates and result text for the Gradio interface.
|
109 |
+
"""
|
110 |
try:
|
111 |
threshold = float(threshold)
|
112 |
|
|
|
245 |
|
246 |
demo.launch()
|
247 |
|
248 |
+
|
249 |
+
|
250 |
# import gradio as gr
|
251 |
# from datasets import load_dataset
|
252 |
# import numpy as np
|
|
|
270 |
# batch_size: int = 1024,
|
271 |
# progress=None
|
272 |
# ) -> tuple[np.ndarray, dict[int, int]]:
|
273 |
+
# """Deduplicate embeddings within one dataset or across two datasets."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
# if embeddings_b is None:
|
275 |
# reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
|
276 |
# duplicate_to_original = {}
|
|
|
298 |
# return duplicate_indices_in_b, duplicate_to_original
|
299 |
|
300 |
# def display_word_differences(x: str, y: str) -> str:
|
301 |
+
# """Display word-level differences between two texts, avoiding Markdown issues."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
# diff = ndiff(x.split(), y.split())
|
|
|
303 |
# formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
|
304 |
# return f"```\n{formatted_diff}\n```"
|
305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
306 |
# def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
|
307 |
+
# """Load texts from a specified dataset and split."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
308 |
# ds = load_dataset(dataset_name, split=dataset_split)
|
309 |
# return [example[text_column] for example in ds]
|
310 |
|
|
|
319 |
# threshold: float = default_threshold,
|
320 |
# progress: gr.Progress = gr.Progress(track_tqdm=True)
|
321 |
# ):
|
322 |
+
# """Perform deduplication on one or two datasets."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
# try:
|
324 |
# threshold = float(threshold)
|
325 |
|
|
|
401 |
# yield f"An error occurred: {e}", ""
|
402 |
# raise e
|
403 |
|
404 |
+
# # Gradio app with stop button support
|
405 |
# with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
406 |
# gr.Markdown("# Semantic Deduplication")
|
407 |
# gr.Markdown("""
|
|
|
431 |
# dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
|
432 |
|
433 |
# threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
|
434 |
+
# compute_button = gr.Button("Deduplicate")
|
435 |
# status_output = gr.Markdown(elem_id="status_output")
|
436 |
# result_output = gr.Markdown()
|
437 |
|
|
|
455 |
# outputs=[status_output, result_output],
|
456 |
# )
|
457 |
|
|
|
458 |
|
459 |
+
# demo.launch()
|