Pringled committed on
Commit
7ed3881
·
1 Parent(s): 892ceeb

Updated app with code for deduplication

Browse files
Files changed (1) hide show
  1. app.py +34 -25
app.py CHANGED
@@ -77,10 +77,10 @@ def perform_deduplication(
77
  dataset1_name,
78
  dataset1_split,
79
  dataset1_text_column,
80
- dataset2_name,
81
- dataset2_split,
82
- dataset2_text_column,
83
- threshold,
84
  progress=gr.Progress(track_tqdm=True)
85
  ):
86
  # Convert threshold to float
@@ -112,7 +112,6 @@ def perform_deduplication(
112
  # Show deduplicated examples
113
  result_text += "**Examples of duplicates found:**\n\n"
114
  num_examples = min(5, num_duplicates)
115
- examples_shown = 0
116
  for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
117
  original_text = texts[original_idx]
118
  duplicate_text = texts[duplicate_idx]
@@ -121,7 +120,6 @@ def perform_deduplication(
121
  result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
122
  result_text += f"**Differences:**\n{differences}\n"
123
  result_text += "-" * 50 + "\n\n"
124
- examples_shown += 1
125
 
126
  return result_text
127
 
@@ -153,7 +151,6 @@ def perform_deduplication(
153
  # Show deduplicated examples
154
  result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
155
  num_examples = min(5, num_duplicates)
156
- examples_shown = 0
157
  for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
158
  original_idx = duplicate_to_original_mapping[duplicate_idx]
159
  original_text = texts1[original_idx]
@@ -163,42 +160,54 @@ def perform_deduplication(
163
  result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
164
  result_text += f"**Differences:**\n{differences}\n"
165
  result_text += "-" * 50 + "\n\n"
166
- examples_shown += 1
167
 
168
  return result_text
169
 
170
  with gr.Blocks() as demo:
171
  gr.Markdown("# Semantic Deduplication")
172
 
173
- deduplication_type = gr.Radio(choices=["Single dataset", "Cross-dataset"], label="Deduplication Type", value="Single dataset")
 
 
 
 
174
 
175
- with gr.Tab("Dataset 1"):
176
- with gr.Row():
177
- dataset1_name = gr.Textbox(value="ag_news", label="Dataset Name")
178
- dataset1_split = gr.Textbox(value="train", label="Split")
179
- dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
180
 
181
- dataset2_tab = gr.Tab("Dataset 2", visible=False)
182
- with dataset2_tab:
 
183
  with gr.Row():
184
- dataset2_name = gr.Textbox(value="ag_news", label="Dataset Name")
185
- dataset2_split = gr.Textbox(value="test", label="Split")
186
  dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
187
 
188
- threshold = gr.Slider(minimum=0.0, maximum=1.0, value=0.8, label="Similarity Threshold")
 
 
 
 
 
189
 
190
  compute_button = gr.Button("Compute")
191
 
192
  output = gr.Markdown()
193
 
194
- # Function to update the visibility of dataset2_tab
195
- def update_visibility(deduplication_type):
196
- if deduplication_type == "Cross-dataset":
197
- return {dataset2_tab: gr.update(visible=True)}
198
  else:
199
- return {dataset2_tab: gr.update(visible=False)}
200
 
201
- deduplication_type.change(update_visibility, inputs=deduplication_type, outputs=[dataset2_tab])
 
 
 
 
202
 
203
  compute_button.click(
204
  fn=perform_deduplication,
 
77
  dataset1_name,
78
  dataset1_split,
79
  dataset1_text_column,
80
+ dataset2_name="",
81
+ dataset2_split="",
82
+ dataset2_text_column="",
83
+ threshold=0.8,
84
  progress=gr.Progress(track_tqdm=True)
85
  ):
86
  # Convert threshold to float
 
112
  # Show deduplicated examples
113
  result_text += "**Examples of duplicates found:**\n\n"
114
  num_examples = min(5, num_duplicates)
 
115
  for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
116
  original_text = texts[original_idx]
117
  duplicate_text = texts[duplicate_idx]
 
120
  result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
121
  result_text += f"**Differences:**\n{differences}\n"
122
  result_text += "-" * 50 + "\n\n"
 
123
 
124
  return result_text
125
 
 
151
  # Show deduplicated examples
152
  result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
153
  num_examples = min(5, num_duplicates)
 
154
  for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
155
  original_idx = duplicate_to_original_mapping[duplicate_idx]
156
  original_text = texts1[original_idx]
 
160
  result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
161
  result_text += f"**Differences:**\n{differences}\n"
162
  result_text += "-" * 50 + "\n\n"
 
163
 
164
  return result_text
165
 
166
  with gr.Blocks() as demo:
167
  gr.Markdown("# Semantic Deduplication")
168
 
169
+ deduplication_type = gr.Radio(
170
+ choices=["Single dataset", "Cross-dataset"],
171
+ label="Deduplication Type",
172
+ value="Single dataset"
173
+ )
174
 
175
+ with gr.Row():
176
+ dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
177
+ dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
178
+ dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
 
179
 
180
+ dataset2_inputs = gr.Column(visible=False)
181
+ with dataset2_inputs:
182
+ gr.Markdown("### Dataset 2")
183
  with gr.Row():
184
+ dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
185
+ dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
186
  dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
187
 
188
+ threshold = gr.Slider(
189
+ minimum=0.0,
190
+ maximum=1.0,
191
+ value=0.8,
192
+ label="Similarity Threshold"
193
+ )
194
 
195
  compute_button = gr.Button("Compute")
196
 
197
  output = gr.Markdown()
198
 
199
+ # Function to update the visibility of dataset2_inputs
200
+ def update_visibility(deduplication_type_value):
201
+ if deduplication_type_value == "Cross-dataset":
202
+ return gr.update(visible=True)
203
  else:
204
+ return gr.update(visible=False)
205
 
206
+ deduplication_type.change(
207
+ update_visibility,
208
+ inputs=deduplication_type,
209
+ outputs=dataset2_inputs
210
+ )
211
 
212
  compute_button.click(
213
  fn=perform_deduplication,