Updated app with code for deduplication
Browse files
app.py
CHANGED
@@ -77,10 +77,10 @@ def perform_deduplication(
|
|
77 |
dataset1_name,
|
78 |
dataset1_split,
|
79 |
dataset1_text_column,
|
80 |
-
dataset2_name,
|
81 |
-
dataset2_split,
|
82 |
-
dataset2_text_column,
|
83 |
-
threshold,
|
84 |
progress=gr.Progress(track_tqdm=True)
|
85 |
):
|
86 |
# Convert threshold to float
|
@@ -112,7 +112,6 @@ def perform_deduplication(
|
|
112 |
# Show deduplicated examples
|
113 |
result_text += "**Examples of duplicates found:**\n\n"
|
114 |
num_examples = min(5, num_duplicates)
|
115 |
-
examples_shown = 0
|
116 |
for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
|
117 |
original_text = texts[original_idx]
|
118 |
duplicate_text = texts[duplicate_idx]
|
@@ -121,7 +120,6 @@ def perform_deduplication(
|
|
121 |
result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
|
122 |
result_text += f"**Differences:**\n{differences}\n"
|
123 |
result_text += "-" * 50 + "\n\n"
|
124 |
-
examples_shown += 1
|
125 |
|
126 |
return result_text
|
127 |
|
@@ -153,7 +151,6 @@ def perform_deduplication(
|
|
153 |
# Show deduplicated examples
|
154 |
result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
|
155 |
num_examples = min(5, num_duplicates)
|
156 |
-
examples_shown = 0
|
157 |
for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
|
158 |
original_idx = duplicate_to_original_mapping[duplicate_idx]
|
159 |
original_text = texts1[original_idx]
|
@@ -163,42 +160,54 @@ def perform_deduplication(
|
|
163 |
result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
|
164 |
result_text += f"**Differences:**\n{differences}\n"
|
165 |
result_text += "-" * 50 + "\n\n"
|
166 |
-
examples_shown += 1
|
167 |
|
168 |
return result_text
|
169 |
|
170 |
with gr.Blocks() as demo:
|
171 |
gr.Markdown("# Semantic Deduplication")
|
172 |
|
173 |
-
deduplication_type = gr.Radio(
|
|
|
|
|
|
|
|
|
174 |
|
175 |
-
with gr.
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
|
180 |
|
181 |
-
|
182 |
-
with
|
|
|
183 |
with gr.Row():
|
184 |
-
dataset2_name = gr.Textbox(value="ag_news", label="Dataset Name")
|
185 |
-
dataset2_split = gr.Textbox(value="test", label="Split")
|
186 |
dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
|
187 |
|
188 |
-
threshold = gr.Slider(
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
compute_button = gr.Button("Compute")
|
191 |
|
192 |
output = gr.Markdown()
|
193 |
|
194 |
-
# Function to update the visibility of
|
195 |
-
def update_visibility(
|
196 |
-
if
|
197 |
-
return
|
198 |
else:
|
199 |
-
return
|
200 |
|
201 |
-
deduplication_type.change(
|
|
|
|
|
|
|
|
|
202 |
|
203 |
compute_button.click(
|
204 |
fn=perform_deduplication,
|
|
|
77 |
dataset1_name,
|
78 |
dataset1_split,
|
79 |
dataset1_text_column,
|
80 |
+
dataset2_name="",
|
81 |
+
dataset2_split="",
|
82 |
+
dataset2_text_column="",
|
83 |
+
threshold=0.8,
|
84 |
progress=gr.Progress(track_tqdm=True)
|
85 |
):
|
86 |
# Convert threshold to float
|
|
|
112 |
# Show deduplicated examples
|
113 |
result_text += "**Examples of duplicates found:**\n\n"
|
114 |
num_examples = min(5, num_duplicates)
|
|
|
115 |
for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
|
116 |
original_text = texts[original_idx]
|
117 |
duplicate_text = texts[duplicate_idx]
|
|
|
120 |
result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
|
121 |
result_text += f"**Differences:**\n{differences}\n"
|
122 |
result_text += "-" * 50 + "\n\n"
|
|
|
123 |
|
124 |
return result_text
|
125 |
|
|
|
151 |
# Show deduplicated examples
|
152 |
result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
|
153 |
num_examples = min(5, num_duplicates)
|
|
|
154 |
for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
|
155 |
original_idx = duplicate_to_original_mapping[duplicate_idx]
|
156 |
original_text = texts1[original_idx]
|
|
|
160 |
result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
|
161 |
result_text += f"**Differences:**\n{differences}\n"
|
162 |
result_text += "-" * 50 + "\n\n"
|
|
|
163 |
|
164 |
return result_text
|
165 |
|
166 |
with gr.Blocks() as demo:
|
167 |
gr.Markdown("# Semantic Deduplication")
|
168 |
|
169 |
+
deduplication_type = gr.Radio(
|
170 |
+
choices=["Single dataset", "Cross-dataset"],
|
171 |
+
label="Deduplication Type",
|
172 |
+
value="Single dataset"
|
173 |
+
)
|
174 |
|
175 |
+
with gr.Row():
|
176 |
+
dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
|
177 |
+
dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
|
178 |
+
dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
|
|
|
179 |
|
180 |
+
dataset2_inputs = gr.Column(visible=False)
|
181 |
+
with dataset2_inputs:
|
182 |
+
gr.Markdown("### Dataset 2")
|
183 |
with gr.Row():
|
184 |
+
dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
|
185 |
+
dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
|
186 |
dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
|
187 |
|
188 |
+
threshold = gr.Slider(
|
189 |
+
minimum=0.0,
|
190 |
+
maximum=1.0,
|
191 |
+
value=0.8,
|
192 |
+
label="Similarity Threshold"
|
193 |
+
)
|
194 |
|
195 |
compute_button = gr.Button("Compute")
|
196 |
|
197 |
output = gr.Markdown()
|
198 |
|
199 |
+
# Function to update the visibility of dataset2_inputs
|
200 |
+
def update_visibility(deduplication_type_value):
|
201 |
+
if deduplication_type_value == "Cross-dataset":
|
202 |
+
return gr.update(visible=True)
|
203 |
else:
|
204 |
+
return gr.update(visible=False)
|
205 |
|
206 |
+
deduplication_type.change(
|
207 |
+
update_visibility,
|
208 |
+
inputs=deduplication_type,
|
209 |
+
outputs=dataset2_inputs
|
210 |
+
)
|
211 |
|
212 |
compute_button.click(
|
213 |
fn=perform_deduplication,
|