Spaces:

ramalMr
/

data_gen

Running

App Files Community

ramalMr committed on Apr 6, 2024

Commit

f5a3917

verified ·

1 Parent(s): cf4b1fe

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -48

app.py CHANGED Viewed

@@ -14,64 +14,55 @@ client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 def extract_sentences_from_excel(file):
     df = pd.read_excel(file)
-    text = ' '.join(df['metn'].astype(str))
-    sentences = text.split('.')
-    sentences = [s.strip() for s in sentences if s.strip() and s.strip() != 'nan']
     return sentences
-import re
-def save_to_json(data, filename="synthetic_data.json"):
-    with open(filename, mode='w', encoding='utf-8') as file:
-        json_data = []
-        for item in data:
-            generated_sentences = []
-            confidence_scores = []
-            for match in re.finditer(r"{'generated_sentence': '(.+?)', 'confidence_score': ([\d\.]+)}", item['generated_data']):
-                generated_sentences.append(match.group(1))
-                confidence_scores.append(float(match.group(2)))
-            json_data.append({
-                'original_sentence': item['original_sentence'],
-                'generated_sentences': generated_sentences,
-                'confidence_scores': confidence_scores
-            })
-        json.dump(json_data, file, indent=4, ensure_ascii=False)
 def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
     sentences = extract_sentences_from_excel(file)
     data = []
-    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as tmp:
-        for sentence in sentences:
-            sentence = sentence.strip()
-            if not sentence:
-                continue
-            generate_kwargs = {
-                "temperature": temperature,
-                "max_new_tokens": max_new_tokens,
-                "top_p": top_p,
-                "repetition_penalty": repetition_penalty,
-                "do_sample": True,
-                "seed": 42,
-            }
-            try:
-                stream = client.text_generation(f"{prompt} Output the response in the following JSON format: {{'generated_sentence': 'The generated sentence text', 'confidence_score': 0.9}} {sentence}", **generate_kwargs, stream=True, details=True, return_full_text=False)
-                output = ""
-                for response in stream:
-                    output += response.token.text
-                data.append({"original_sentence": sentence, "generated_data": output})
-            except Exception as e:
-                print(f"Error generating data for sentence '{sentence}': {e}")
-        save_to_json(data, tmp.name)
-        tmp_path = tmp.name
-    return tmp_path
 gr.Interface(
     fn=generate,
     inputs=[
@@ -82,8 +73,8 @@ gr.Interface(
         gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
         gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
     ],
-    outputs=gr.File(label="Synthetic Data "),
     title="SDG",
-    description="AYE QABIL.",
     allow_flagging="never",
 ).launch()

 def extract_sentences_from_excel(file):
     df = pd.read_excel(file)
+    sentences = df['metn'].astype(str).tolist()
     return sentences
 def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
     sentences = extract_sentences_from_excel(file)
     data = []
+    generate_kwargs = {
+        "temperature": temperature,
+        "max_new_tokens": max_new_tokens,
+        "top_p": top_p,
+        "repetition_penalty": repetition_penalty,
+        "do_sample": True,
+        "seed": 42,
+    }
+    for sentence in sentences:
+        try:
+            stream = client.text_generation(f"{prompt} Output the response in the following JSON format: {{'generated_sentence': 'The generated sentence text', 'confidence_score': 0.9}} {sentence}", **generate_kwargs, stream=True, details=True, return_full_text=False)
+            output = ""
+            for response in stream:
+                output += response.token.text
+            data.append({"original_sentence": sentence, "generated_data": output})
+        except Exception as e:
+            print(f"Error generating data for sentence '{sentence}': {e}")
+    filename = "synthetic_data.json"
+    save_to_json(data, filename)
+    return filename
+def save_to_json(data, filename):
+    json_data = []
+    for item in data:
+        generated_sentences = []
+        confidence_scores = []
+        for match in re.finditer(r"{'generated_sentence': '(.+?)', 'confidence_score': ([\d\.]+)}", item['generated_data']):
+            generated_sentences.append(match.group(1))
+            confidence_scores.append(float(match.group(2)))
+        json_data.append({
+            'original_sentence': item['original_sentence'],
+            'generated_sentences': generated_sentences,
+            'confidence_scores': confidence_scores
+        })
+    with open(filename, mode='w', encoding='utf-8') as file:
+        json.dump(json_data, file, indent=4, ensure_ascii=False)
+# Gradio arayüzü
 gr.Interface(
     fn=generate,
     inputs=[
         gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
         gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
     ],
+    outputs=gr.File(label="Synthetic Data"),
     title="SDG",
+    description=" *AYE* QABIL.",
     allow_flagging="never",
 ).launch()