ramalMr committed on
Commit
7bc64c3
·
verified ·
1 Parent(s): fceacf5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -74
app.py CHANGED
@@ -7,107 +7,85 @@ client = InferenceClient(
7
  "mistralai/Mixtral-8x7B-Instruct-v0.1"
8
  )
9
 
10
- def format_prompt(message, history):
11
- prompt = "<s>"
12
- for user_prompt, bot_response in history:
13
- prompt += f"[INST] {user_prompt} [/INST]"
14
- prompt += f" {bot_response}</s> "
15
- prompt += f"[INST] {message} [/INST]"
16
- return prompt
17
-
18
  def tokenize_sentences(file_content):
19
  sentences = sent_tokenize(file_content.decode())
20
  return sentences
21
 
22
- def generate_synthetic_data(prompt, sentences):
23
  synthetic_data = []
24
- for sentence in sentences:
25
  # Apply the prompt instructions to generate synthetic data from the sentence
26
  synthetic_sentence = f"{prompt}: {sentence}"
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  synthetic_data.append(synthetic_sentence)
28
  return "\n".join(synthetic_data)
29
 
30
- def generate(
31
- prompt, history, system_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0, files=None
32
- ):
33
- temperature = float(temperature)
34
- if temperature < 1e-2:
35
- temperature = 1e-2
36
- top_p = float(top_p)
37
 
38
- generate_kwargs = dict(
39
- temperature=temperature,
40
- max_new_tokens=max_new_tokens,
41
- top_p=top_p,
42
- repetition_penalty=repetition_penalty,
43
- do_sample=True,
44
- seed=42,
45
- )
46
 
47
- formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
 
 
48
 
 
 
 
 
 
49
  if files is not None:
50
  file_contents = [file.decode() for file in files]
51
  sentences = []
52
  for content in file_contents:
53
  sentences.extend(tokenize_sentences(content))
54
- synthetic_data = generate_synthetic_data(prompt, sentences)
55
- formatted_prompt += f"\n\nSynthetic data: {synthetic_data}"
56
-
57
- stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
58
- output = ""
59
-
60
- for response in stream:
61
- output += response.token.text
62
- yield output
63
- return output
64
 
65
  additional_inputs=[
66
- gr.Textbox(
67
- label="System Prompt",
68
- max_lines=1,
69
- interactive=True,
70
- ),
71
  gr.Textbox(
72
  label="Prompt for Synthetic Data Generation",
73
  max_lines=1,
74
  interactive=True,
75
  ),
76
  gr.Slider(
77
- label="Temperature",
78
- value=0.9,
79
- minimum=0.0,
80
- maximum=1.0,
81
- step=0.05,
82
  interactive=True,
83
- info="Higher values produce more diverse outputs",
84
  ),
85
- gr.Slider(
86
- label="Max new tokens",
87
- value=256,
88
- minimum=0,
89
- maximum=5120,
90
- step=64,
91
  interactive=True,
92
- info="The maximum numbers of new tokens",
93
  ),
94
- gr.Slider(
95
- label="Top-p (nucleus sampling)",
96
- value=0.90,
97
- minimum=0.0,
98
- maximum=1,
99
- step=0.05,
100
- interactive=True,
101
- info="Higher values sample more low-probability tokens",
102
- ),
103
- gr.Slider(
104
- label="Repetition penalty",
105
- value=1.2,
106
- minimum=1.0,
107
- maximum=2.0,
108
- step=0.05,
109
  interactive=True,
110
- info="Penalize repeated tokens",
111
  ),
112
  gr.File(
113
  label="Upload PDF or Document",
@@ -117,10 +95,10 @@ additional_inputs=[
117
  )
118
  ]
119
 
120
- gr.ChatInterface(
121
  fn=generate,
122
- chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
123
- additional_inputs=additional_inputs,
124
  title="Synthetic-data-generation-aze",
125
- concurrency_limit=20,
126
  ).launch(show_api=False)
 
7
  "mistralai/Mixtral-8x7B-Instruct-v0.1"
8
  )
9
 
 
 
 
 
 
 
 
 
10
def tokenize_sentences(file_content):
    """Split raw document content into a list of sentences.

    Accepts either ``bytes`` or an already-decoded ``str``. Callers in this
    file pass both: ``generate`` decodes each upload to ``str`` before calling
    us, so the original unconditional ``file_content.decode()`` raised
    ``AttributeError`` on that path.

    Parameters
    ----------
    file_content : bytes | str
        Raw or decoded document text.

    Returns
    -------
    list[str]
        Sentences produced by NLTK's ``sent_tokenize``.
    """
    if isinstance(file_content, bytes):
        file_content = file_content.decode()
    return sent_tokenize(file_content)
13
 
14
def generate_synthetic_data(prompt, sentences, data_size, toxicity_level, use_emoji):
    """Build synthetic-data lines from at most *data_size* sentences.

    Each selected sentence is prefixed with *prompt*, optionally pushed
    through the toxicity helpers ("High" adds, "Low" removes, anything
    else leaves it alone), and finally has emojis added or stripped
    depending on *use_emoji*. Lines are joined with newlines.
    """

    def _transform(raw_sentence):
        # Prefix with the user-supplied prompt, then apply the toggles.
        line = f"{prompt}: {raw_sentence}"
        if toxicity_level == "High":
            line = add_toxic_content(line)
        elif toxicity_level == "Low":
            line = remove_toxic_content(line)
        return add_emojis(line) if use_emoji else remove_emojis(line)

    return "\n".join(_transform(s) for s in sentences[:data_size])
34
 
35
def add_toxic_content(text):
    # Placeholder: intended to make the text more toxic ("High" toxicity
    # option in the UI), but no transformation is implemented yet — the
    # input is returned unchanged.
    # TODO(review): implement or remove the "High" toxicity option.
    return text
 
 
 
 
38
 
39
def remove_toxic_content(text):
    # Placeholder: intended to strip toxic content ("Low" toxicity option
    # in the UI), but no transformation is implemented yet — the input is
    # returned unchanged.
    # TODO(review): implement or remove the "Low" toxicity option.
    return text
 
 
 
 
 
42
 
43
def add_emojis(text):
    # Placeholder: intended to decorate the text with emojis when the
    # "Use Emoji" checkbox is on, but no transformation is implemented
    # yet — the input is returned unchanged.
    # TODO(review): implement or hide the "Use Emoji" option.
    return text
46
 
47
def remove_emojis(text):
    """Strip emoji characters from *text*.

    The original body was an unimplemented stub (its own comment said
    "Add code to remove emojis" but it returned the input unchanged),
    which made the "Use Emoji = False" UI option a silent no-op.

    Removes code points in the common emoji blocks plus the zero-width
    joiner and variation selectors used in emoji sequences; all other
    characters are preserved.

    Parameters: text (str). Returns: str with emojis removed.
    """
    import re

    emoji_pattern = re.compile(
        "["
        "\U0001F300-\U0001FAFF"  # symbols & pictographs, emoticons, extended
        "\U00002600-\U000027BF"  # misc symbols and dingbats
        "\U0001F1E6-\U0001F1FF"  # regional-indicator (flag) letters
        "\u200d\ufe0e\ufe0f"     # ZWJ and variation selectors
        "]+"
    )
    return emoji_pattern.sub("", text)
50
+
51
def generate(prompt, max_data_size=100, toxicity_level="Neutral", use_emoji=False, files=None):
    """Gradio handler: generate synthetic data from uploaded files.

    Parameters mirror the UI inputs:
        prompt (str): instruction prefixed to every sentence.
        max_data_size (int): cap on the number of sentences used.
        toxicity_level (str): "High", "Low", or "Neutral".
        use_emoji (bool): whether emojis are added (True) or stripped (False).
        files: list of uploaded file payloads, or None.

    Returns the synthetic data as a newline-joined string, or a help
    message when no usable files were supplied.
    """
    # Treat both None and an empty upload list as "nothing to process".
    # The original only checked `is not None`, so an empty list produced
    # an empty string instead of the help message.
    if not files:
        return "Please upload a file to generate synthetic data."

    sentences = []
    for uploaded in files:
        # Uploads may arrive as raw bytes or as already-decoded text
        # depending on the gr.File configuration — handle both instead of
        # assuming bytes. TODO(review): confirm the payload type gr.File
        # actually delivers in this app's Gradio version.
        content = uploaded.decode() if isinstance(uploaded, bytes) else uploaded
        sentences.extend(tokenize_sentences(content))
    return generate_synthetic_data(prompt, sentences, max_data_size, toxicity_level, use_emoji)
 
 
 
 
 
 
61
 
62
  additional_inputs=[
 
 
 
 
 
63
  gr.Textbox(
64
  label="Prompt for Synthetic Data Generation",
65
  max_lines=1,
66
  interactive=True,
67
  ),
68
  gr.Slider(
69
+ label="Max Data Size",
70
+ value=100,
71
+ minimum=10,
72
+ maximum=1000,
73
+ step=10,
74
  interactive=True,
75
+ info="The maximum number of sentences to include in the synthetic data",
76
  ),
77
+ gr.Radio(
78
+ label="Toxicity Level",
79
+ choices=["High", "Low", "Neutral"],
80
+ value="Neutral",
 
 
81
  interactive=True,
82
+ info="Adjust the toxicity level of the synthetic data",
83
  ),
84
+ gr.Checkbox(
85
+ label="Use Emoji",
86
+ value=False,
 
 
 
 
 
 
 
 
 
 
 
 
87
  interactive=True,
88
+ info="Add or remove emojis in the synthetic data",
89
  ),
90
  gr.File(
91
  label="Upload PDF or Document",
 
95
  )
96
  ]
97
 
98
# Wire the generator into a simple Gradio interface and serve it.
app_interface = gr.Interface(
    fn=generate,
    inputs=additional_inputs,
    outputs="text",
    title="Synthetic-data-generation-aze",
    description="Generate synthetic data from uploaded files based on a given prompt and customization options.",
)
app_interface.launch(show_api=False)