ramalMr committed · commit 18cb91d (verified) · 1 parent: 4cab160

Update app.py

Files changed (1)
  1. app.py +45 -38
app.py CHANGED
@@ -2,62 +2,69 @@ from huggingface_hub import InferenceClient
 import gradio as gr
 import random
 import pandas as pd
-from io import BytesIO
+from io import BytesIO
+import csv
+import os
+import io
+import tempfile
+import re

 client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

 def extract_text_from_excel(file):
     df = pd.read_excel(file)
-    text = ' '.join(df['data'].astype(str))
+    text = ' '.join(df['Unnamed: 1'].astype(str))
     return text

-def generate_sentences(text, temperature, max_new_tokens, top_p, repetition_penalty):
+def save_to_csv(sentence, output, filename="synthetic_data.csv"):
+    with open(filename, mode='a', newline='', encoding='utf-8') as file:
+        writer = csv.writer(file)
+        writer.writerow([sentence, output])
+
+def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
+    text = extract_text_from_excel(file)
     sentences = text.split('.')
     random.shuffle(sentences)  # Shuffle sentences

-    generated_data = []
+    with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
+        fieldnames = ['Original Sentence', 'Generated Sentence']
+        writer = csv.DictWriter(tmp, fieldnames=fieldnames)
+        writer.writeheader()

-    for sentence in sentences:
-        sentence = sentence.strip()
-        if not sentence:
-            continue
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue

-        generate_kwargs = {
-            "temperature": temperature,
-            "max_new_tokens": max_new_tokens,
-            "top_p": top_p,
-            "repetition_penalty": repetition_penalty,
-            "do_sample": True,
-            "seed": 42,
-        }
+            generate_kwargs = {
+                "temperature": temperature,
+                "max_new_tokens": max_new_tokens,
+                "top_p": top_p,
+                "repetition_penalty": repetition_penalty,
+                "do_sample": True,
+                "seed": 42,
+            }

-        try:
-            stream = client.text_generation(sentence, **generate_kwargs, stream=True, details=True, return_full_text=False)
-            output = ""
-            for response in stream:
-                output += response.token.text
+            try:
+                stream = client.text_generation(sentence, **generate_kwargs, stream=True, details=True, return_full_text=False)
+                output = ""
+                for response in stream:
+                    output += response.token.text

-            generated_sentences = [s.strip() for s in output.split('.') if s.strip()]
-            generated_data.extend([(sentence, generated_sentence) for generated_sentence in generated_sentences])
+                generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
+                generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']

-        except Exception as e:
-            print(f"Error generating data for sentence '{sentence}': {e}")
+                for generated_sentence in generated_sentences:
+                    writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})

-    return generated_data
+            except Exception as e:
+                print(f"Error generating data for sentence '{sentence}': {e}")

-def save_to_csv(data, filename="synthetic_data.csv"):
-    with open(filename, mode='w', newline='', encoding='utf-8') as file:
-        writer = csv.writer(file)
-        writer.writerow(['Original Sentence', 'Generated Sentence'])
-        writer.writerows(data)
+        tmp_path = tmp.name

-def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
-    text = extract_text_from_excel(file)
-    data = generate_sentences(text, temperature, max_new_tokens, top_p, repetition_penalty)
-    save_to_csv(data)
-    return gr.File.update(value=filename, visible=True)
+    return tmp_path

-gr.Interface(
+gr.Interface(
     fn=generate,
     inputs=[
         gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
@@ -66,7 +73,7 @@ gr.Interface(
         gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
         gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
     ],
-    outputs=gr.File(label="Synthetic Data"),
+    outputs=gr.File(label="Synthetic Data "),
     title="SDG",
     description="AYE QABIL.",
     allow_flagging="never",
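
For reference, a minimal self-contained sketch of the behaviour this commit introduces: splitting the accumulated model output on sentence-ending punctuation with re.split and appending the (original, generated) pairs to a temporary CSV whose path is returned to gr.File. It is not part of the commit; the output string and original_sentence below are made-up placeholders standing in for the streamed Mixtral response and the Excel text, so the sketch runs without the Inference API.

    import csv
    import re
    import tempfile

    # Placeholder for the text accumulated from client.text_generation(..., stream=True).
    output = "First generated sentence. Second one! A third: with a colon.\nFourth?"
    # Placeholder for one sentence taken from the uploaded Excel file.
    original_sentence = "Example source sentence"

    # Same split as the commit: break after ., !, ? or : followed by whitespace,
    # then drop empty fragments and bare periods.
    generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
    generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']

    # Same output shape as the commit: a temp CSV whose path is handed back to gr.File.
    with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
        writer = csv.DictWriter(tmp, fieldnames=['Original Sentence', 'Generated Sentence'])
        writer.writeheader()
        for generated_sentence in generated_sentences:
            writer.writerow({'Original Sentence': original_sentence,
                             'Generated Sentence': generated_sentence})
        tmp_path = tmp.name

    print(tmp_path)  # generate() returns this path for Gradio to serve as a downloadable file

The previous version called csv.writer without importing csv and returned gr.File.update(value=filename, visible=True) with filename undefined inside generate(); writing to a NamedTemporaryFile and returning its path avoids both problems and matches how recent Gradio releases expect file outputs to be returned from the callback.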