Pringled committed on
Commit
ed5b7bd
·
1 Parent(s): d54c792
Files changed (1) hide show
  1. app.py +49 -58
app.py CHANGED
@@ -21,7 +21,16 @@ def deduplicate_embeddings(
21
  batch_size: int = 1024,
22
  progress=None
23
  ) -> tuple[np.ndarray, dict[int, int]]:
24
- """Deduplicate embeddings within one dataset or across two datasets."""
 
 
 
 
 
 
 
 
 
25
  if embeddings_b is None:
26
  reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
27
  duplicate_to_original = {}
@@ -49,13 +58,27 @@ def deduplicate_embeddings(
49
  return duplicate_indices_in_b, duplicate_to_original
50
 
51
  def display_word_differences(x: str, y: str) -> str:
52
- """Display word-level differences between two texts, avoiding Markdown issues."""
 
 
 
 
 
 
 
53
  diff = ndiff(x.split(), y.split())
54
  formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
55
  return f"```\n{formatted_diff}\n```"
56
 
57
  def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
58
- """Load texts from a specified dataset and split."""
 
 
 
 
 
 
 
59
  ds = load_dataset(dataset_name, split=dataset_split)
60
  return [example[text_column] for example in ds]
61
 
@@ -70,7 +93,20 @@ def perform_deduplication(
70
  threshold: float = default_threshold,
71
  progress: gr.Progress = gr.Progress(track_tqdm=True)
72
  ):
73
- """Perform deduplication on one or two datasets."""
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  try:
75
  threshold = float(threshold)
76
 
@@ -209,6 +245,8 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
209
 
210
  demo.launch()
211
 
 
 
212
  # import gradio as gr
213
  # from datasets import load_dataset
214
  # import numpy as np
@@ -232,16 +270,7 @@ demo.launch()
232
  # batch_size: int = 1024,
233
  # progress=None
234
  # ) -> tuple[np.ndarray, dict[int, int]]:
235
- # """
236
- # Deduplicate embeddings within one dataset or across two datasets.
237
-
238
- # :param embeddings_a: Embeddings of Dataset 1.
239
- # :param embeddings_b: Optional, embeddings of Dataset 2.
240
- # :param threshold: Similarity threshold for deduplication.
241
- # :param batch_size: Batch size for similarity computation.
242
- # :param progress: Gradio progress tracker for feedback.
243
- # :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
244
- # """
245
  # if embeddings_b is None:
246
  # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
247
  # duplicate_to_original = {}
@@ -269,39 +298,13 @@ demo.launch()
269
  # return duplicate_indices_in_b, duplicate_to_original
270
 
271
  # def display_word_differences(x: str, y: str) -> str:
272
- # """
273
- # Display the word-level differences between two texts, formatted to avoid
274
- # misinterpretation of Markdown syntax.
275
-
276
- # :param x: First text.
277
- # :param y: Second text.
278
- # :return: A string showing word-level differences, wrapped in a code block.
279
- # """
280
  # diff = ndiff(x.split(), y.split())
281
- # # Wrap differences in a code block to prevent interpretation as Markdown
282
  # formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
283
  # return f"```\n{formatted_diff}\n```"
284
 
285
- # # def display_word_differences(x: str, y: str) -> str:
286
- # # """
287
- # # Display the word-level differences between two texts.
288
-
289
- # # :param x: First text.
290
- # # :param y: Second text.
291
- # # :return: A string showing word-level differences.
292
- # # """
293
- # # diff = ndiff(x.split(), y.split())
294
- # # return " ".join(word for word in diff if word.startswith(("+", "-")))
295
-
296
  # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
297
- # """
298
- # Load texts from a specified dataset and split.
299
-
300
- # :param dataset_name: Name of the dataset.
301
- # :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
302
- # :param text_column: Name of the text column.
303
- # :return: A list of texts from the dataset.
304
- # """
305
  # ds = load_dataset(dataset_name, split=dataset_split)
306
  # return [example[text_column] for example in ds]
307
 
@@ -316,20 +319,7 @@ demo.launch()
316
  # threshold: float = default_threshold,
317
  # progress: gr.Progress = gr.Progress(track_tqdm=True)
318
  # ):
319
- # """
320
- # Perform deduplication on one or two datasets based on the deduplication type.
321
-
322
- # :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
323
- # :param dataset1_name: Name of the first dataset.
324
- # :param dataset1_split: Split of the first dataset.
325
- # :param dataset1_text_column: Text column of the first dataset.
326
- # :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
327
- # :param dataset2_split: Optional, split of the second dataset.
328
- # :param dataset2_text_column: Optional, text column of the second dataset.
329
- # :param threshold: Similarity threshold for deduplication.
330
- # :param progress: Gradio progress tracker.
331
- # :return: Status updates and result text for the Gradio interface.
332
- # """
333
  # try:
334
  # threshold = float(threshold)
335
 
@@ -411,6 +401,7 @@ demo.launch()
411
  # yield f"An error occurred: {e}", ""
412
  # raise e
413
 
 
414
  # with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
415
  # gr.Markdown("# Semantic Deduplication")
416
  # gr.Markdown("""
@@ -440,7 +431,7 @@ demo.launch()
440
  # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
441
 
442
  # threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
443
- # compute_button = gr.Button("Compute")
444
  # status_output = gr.Markdown(elem_id="status_output")
445
  # result_output = gr.Markdown()
446
 
@@ -464,5 +455,5 @@ demo.launch()
464
  # outputs=[status_output, result_output],
465
  # )
466
 
467
- # demo.launch()
468
 
 
 
21
  batch_size: int = 1024,
22
  progress=None
23
  ) -> tuple[np.ndarray, dict[int, int]]:
24
+ """
25
+ Deduplicate embeddings within one dataset or across two datasets.
26
+
27
+ :param embeddings_a: Embeddings of Dataset 1.
28
+ :param embeddings_b: Optional, embeddings of Dataset 2.
29
+ :param threshold: Similarity threshold for deduplication.
30
+ :param batch_size: Batch size for similarity computation.
31
+ :param progress: Gradio progress tracker for feedback.
32
+ :return: Deduplicated indices and a mapping of removed indices to their original counterparts.
33
+ """
34
  if embeddings_b is None:
35
  reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
36
  duplicate_to_original = {}
 
58
  return duplicate_indices_in_b, duplicate_to_original
59
 
60
  def display_word_differences(x: str, y: str) -> str:
61
+ """
62
+ Display the word-level differences between two texts, formatted to avoid
63
+ misinterpretation of Markdown syntax.
64
+
65
+ :param x: First text.
66
+ :param y: Second text.
67
+ :return: A string showing word-level differences, wrapped in a code block.
68
+ """
69
  diff = ndiff(x.split(), y.split())
70
  formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
71
  return f"```\n{formatted_diff}\n```"
72
 
73
  def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
74
+ """
75
+ Load texts from a specified dataset and split.
76
+
77
+ :param dataset_name: Name of the dataset.
78
+ :param dataset_split: Split of the dataset (e.g., 'train', 'validation').
79
+ :param text_column: Name of the text column.
80
+ :return: A list of texts from the dataset.
81
+ """
82
  ds = load_dataset(dataset_name, split=dataset_split)
83
  return [example[text_column] for example in ds]
84
 
 
93
  threshold: float = default_threshold,
94
  progress: gr.Progress = gr.Progress(track_tqdm=True)
95
  ):
96
+ """
97
+ Perform deduplication on one or two datasets based on the deduplication type.
98
+
99
+ :param deduplication_type: 'Single dataset' or 'Cross-dataset'.
100
+ :param dataset1_name: Name of the first dataset.
101
+ :param dataset1_split: Split of the first dataset.
102
+ :param dataset1_text_column: Text column of the first dataset.
103
+ :param dataset2_name: Optional, name of the second dataset (for cross-dataset deduplication).
104
+ :param dataset2_split: Optional, split of the second dataset.
105
+ :param dataset2_text_column: Optional, text column of the second dataset.
106
+ :param threshold: Similarity threshold for deduplication.
107
+ :param progress: Gradio progress tracker.
108
+ :return: Status updates and result text for the Gradio interface.
109
+ """
110
  try:
111
  threshold = float(threshold)
112
 
 
245
 
246
  demo.launch()
247
 
248
+
249
+
250
  # import gradio as gr
251
  # from datasets import load_dataset
252
  # import numpy as np
 
270
  # batch_size: int = 1024,
271
  # progress=None
272
  # ) -> tuple[np.ndarray, dict[int, int]]:
273
+ # """Deduplicate embeddings within one dataset or across two datasets."""
 
 
 
 
 
 
 
 
 
274
  # if embeddings_b is None:
275
  # reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
276
  # duplicate_to_original = {}
 
298
  # return duplicate_indices_in_b, duplicate_to_original
299
 
300
  # def display_word_differences(x: str, y: str) -> str:
301
+ # """Display word-level differences between two texts, avoiding Markdown issues."""
 
 
 
 
 
 
 
302
  # diff = ndiff(x.split(), y.split())
 
303
  # formatted_diff = "\n".join(word for word in diff if word.startswith(("+", "-")))
304
  # return f"```\n{formatted_diff}\n```"
305
 
 
 
 
 
 
 
 
 
 
 
 
306
  # def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
307
+ # """Load texts from a specified dataset and split."""
 
 
 
 
 
 
 
308
  # ds = load_dataset(dataset_name, split=dataset_split)
309
  # return [example[text_column] for example in ds]
310
 
 
319
  # threshold: float = default_threshold,
320
  # progress: gr.Progress = gr.Progress(track_tqdm=True)
321
  # ):
322
+ # """Perform deduplication on one or two datasets."""
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  # try:
324
  # threshold = float(threshold)
325
 
 
401
  # yield f"An error occurred: {e}", ""
402
  # raise e
403
 
404
+ # # Gradio app with stop button support
405
  # with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
406
  # gr.Markdown("# Semantic Deduplication")
407
  # gr.Markdown("""
 
431
  # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
432
 
433
  # threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
434
+ # compute_button = gr.Button("Deduplicate")
435
  # status_output = gr.Markdown(elem_id="status_output")
436
  # result_output = gr.Markdown()
437
 
 
455
  # outputs=[status_output, result_output],
456
  # )
457
 
 
458
 
459
+ # demo.launch()