Update app.py
app.py
CHANGED
@@ -1,18 +1,31 @@
 from huggingface_hub import InferenceClient
 import gradio as gr
+import re
+from nltk.tokenize import sent_tokenize
 
 client = InferenceClient(
     "mistralai/Mixtral-8x7B-Instruct-v0.1"
 )
 
-
 def format_prompt(message, history):
-
-
-
-
-
-
+    prompt = "<s>"
+    for user_prompt, bot_response in history:
+        prompt += f"[INST] {user_prompt} [/INST]"
+        prompt += f" {bot_response}</s> "
+    prompt += f"[INST] {message} [/INST]"
+    return prompt
+
+def tokenize_sentences(file_content):
+    sentences = sent_tokenize(file_content)  # content is already decoded to text by the caller
+    return sentences
+
+def generate_synthetic_data(prompt, sentences):
+    synthetic_data = []
+    for sentence in sentences:
+        # Apply the prompt instructions to generate synthetic data from the sentence
+        synthetic_sentence = f"{prompt}: {sentence}"
+        synthetic_data.append(synthetic_sentence)
+    return "\n".join(synthetic_data)
 
 def generate(
     prompt, history, system_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0, files=None
@@ -34,8 +47,12 @@ def generate(
     formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
 
     if files is not None:
-        for file in files:
-
+        file_contents = [file.decode() for file in files]
+        sentences = []
+        for content in file_contents:
+            sentences.extend(tokenize_sentences(content))
+        synthetic_data = generate_synthetic_data(prompt, sentences)
+        formatted_prompt += f"\n\nSynthetic data: {synthetic_data}"
 
     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
     output = ""
@@ -45,13 +62,17 @@ def generate(
         yield output
     return output
 
-
 additional_inputs=[
     gr.Textbox(
         label="System Prompt",
         max_lines=1,
         interactive=True,
     ),
+    gr.Textbox(
+        label="Prompt for Synthetic Data Generation",
+        max_lines=1,
+        interactive=True,
+    ),
     gr.Slider(
         label="Temperature",
         value=0.9,
@@ -96,8 +117,6 @@ additional_inputs=[
     )
 ]
 
-
-
 gr.ChatInterface(
     fn=generate,
     chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
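
For reference, a minimal sketch (not part of the commit) of the prompt string the new format_prompt builds, using Mixtral's [INST] chat markup; the one-turn history below is invented for illustration:

def format_prompt(message, history):
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "
    prompt += f"[INST] {message} [/INST]"
    return prompt

history = [("Hello", "Hi there!")]
print(format_prompt("Summarise the uploaded file", history))
# <s>[INST] Hello [/INST] Hi there!</s> [INST] Summarise the uploaded file [/INST]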
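
The new tokenize_sentences helper relies on NLTK's sent_tokenize, which needs the punkt tokenizer data to be present in the Space (it is not bundled with the nltk package itself). A rough sketch of the file-to-synthetic-data path under that assumption, with made-up sample text:

import nltk
nltk.download("punkt", quiet=True)  # one-time download of the sentence tokenizer data
from nltk.tokenize import sent_tokenize

text = "NLTK splits raw text into sentences. Each sentence then gets the instruction prepended."
sentences = sent_tokenize(text)
synthetic = "\n".join(f"Paraphrase this sentence: {s}" for s in sentences)
print(synthetic)
# Paraphrase this sentence: NLTK splits raw text into sentences.
# Paraphrase this sentence: Each sentence then gets the instruction prepended.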
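
On the Gradio side, each component listed in additional_inputs is passed to fn as an extra positional argument after the message and the history, in the order the components appear, so a new input such as the synthetic-data prompt box needs a matching parameter on generate. A stripped-down sketch of that wiring, using a hypothetical echo handler rather than the app's actual generate function:

import gradio as gr

def echo(message, history, system_prompt, temperature):
    # additional_inputs values arrive here, in order, after message and history
    return f"[{system_prompt} @ T={temperature}] {message}"

demo = gr.ChatInterface(
    fn=echo,
    additional_inputs=[
        gr.Textbox(label="System Prompt", max_lines=1, interactive=True),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.9),
    ],
)
demo.launch()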