Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
from huggingface_hub import InferenceClient
|
2 |
import gradio as gr
|
3 |
-
import PyPDF2
|
4 |
import random
|
5 |
import pandas as pd
|
6 |
from io import BytesIO
|
@@ -22,7 +21,7 @@ def save_to_csv(sentence, output, filename="synthetic_data.csv"):
|
|
22 |
writer = csv.writer(file)
|
23 |
writer.writerow([sentence, output])
|
24 |
|
25 |
-
def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
|
26 |
text = extract_text_from_excel(file)
|
27 |
sentences = text.split('.')
|
28 |
random.shuffle(sentences) # Shuffle sentences
|
@@ -55,7 +54,10 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
|
|
55 |
generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
|
56 |
generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']
|
57 |
|
58 |
-
for generated_sentence in generated_sentences:
|
|
|
|
|
|
|
59 |
writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})
|
60 |
|
61 |
except Exception as e:
|
@@ -73,6 +75,7 @@ gr.Interface(
|
|
73 |
gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
|
74 |
gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
|
75 |
gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
|
|
|
76 |
],
|
77 |
outputs=gr.File(label="Synthetic Data "),
|
78 |
title="SDG",
|
|
|
1 |
from huggingface_hub import InferenceClient
|
2 |
import gradio as gr
|
|
|
3 |
import random
|
4 |
import pandas as pd
|
5 |
from io import BytesIO
|
|
|
21 |
writer = csv.writer(file)
|
22 |
writer.writerow([sentence, output])
|
23 |
|
24 |
+
def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_similar_sentences):
|
25 |
text = extract_text_from_excel(file)
|
26 |
sentences = text.split('.')
|
27 |
random.shuffle(sentences) # Shuffle sentences
|
|
|
54 |
generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
|
55 |
generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']
|
56 |
|
57 |
+
for _ in range(num_similar_sentences):
|
58 |
+
if not generated_sentences:
|
59 |
+
break
|
60 |
+
generated_sentence = generated_sentences.pop(random.randrange(len(generated_sentences)))
|
61 |
writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})
|
62 |
|
63 |
except Exception as e:
|
|
|
75 |
gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
|
76 |
gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
|
77 |
gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
|
78 |
+
gr.Slider(label="Number of similar sentences", value=10, minimum=1, maximum=20, step=1, interactive=True, info="Number of similar sentences to generate for each original sentence"),
|
79 |
],
|
80 |
outputs=gr.File(label="Synthetic Data "),
|
81 |
title="SDG",
|