ramalMr committed on
Commit
79a6f49
·
verified ·
1 Parent(s): 09b14bf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -3
app.py CHANGED
@@ -1,6 +1,5 @@
1
  from huggingface_hub import InferenceClient
2
  import gradio as gr
3
- import PyPDF2
4
  import random
5
  import pandas as pd
6
  from io import BytesIO
@@ -22,7 +21,7 @@ def save_to_csv(sentence, output, filename="synthetic_data.csv"):
22
  writer = csv.writer(file)
23
  writer.writerow([sentence, output])
24
 
25
- def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
26
  text = extract_text_from_excel(file)
27
  sentences = text.split('.')
28
  random.shuffle(sentences) # Shuffle sentences
@@ -55,7 +54,10 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
55
  generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
56
  generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']
57
 
58
- for generated_sentence in generated_sentences:
 
 
 
59
  writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})
60
 
61
  except Exception as e:
@@ -73,6 +75,7 @@ gr.Interface(
73
  gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
74
  gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
75
  gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
 
76
  ],
77
  outputs=gr.File(label="Synthetic Data "),
78
  title="SDG",
 
1
  from huggingface_hub import InferenceClient
2
  import gradio as gr
 
3
  import random
4
  import pandas as pd
5
  from io import BytesIO
 
21
  writer = csv.writer(file)
22
  writer.writerow([sentence, output])
23
 
24
+ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_similar_sentences):
25
  text = extract_text_from_excel(file)
26
  sentences = text.split('.')
27
  random.shuffle(sentences) # Shuffle sentences
 
54
  generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
55
  generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']
56
 
57
+ for _ in range(num_similar_sentences):
58
+ if not generated_sentences:
59
+ break
60
+ generated_sentence = generated_sentences.pop(random.randrange(len(generated_sentences)))
61
  writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})
62
 
63
  except Exception as e:
 
75
  gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
76
  gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
77
  gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
78
+ gr.Slider(label="Number of similar sentences", value=10, minimum=1, maximum=20, step=1, interactive=True, info="Number of similar sentences to generate for each original sentence"),
79
  ],
80
  outputs=gr.File(label="Synthetic Data "),
81
  title="SDG",