ramalMr committed on
Commit
f5a3917
·
verified ·
1 Parent(s): cf4b1fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -48
app.py CHANGED
@@ -14,64 +14,55 @@ client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
14
 
15
  def extract_sentences_from_excel(file):
16
  df = pd.read_excel(file)
17
- text = ' '.join(df['metn'].astype(str))
18
- sentences = text.split('.')
19
- sentences = [s.strip() for s in sentences if s.strip() and s.strip() != 'nan']
20
  return sentences
21
 
22
- import re
23
-
24
- def save_to_json(data, filename="synthetic_data.json"):
25
- with open(filename, mode='w', encoding='utf-8') as file:
26
- json_data = []
27
- for item in data:
28
- generated_sentences = []
29
- confidence_scores = []
30
- for match in re.finditer(r"{'generated_sentence': '(.+?)', 'confidence_score': ([\d\.]+)}", item['generated_data']):
31
- generated_sentences.append(match.group(1))
32
- confidence_scores.append(float(match.group(2)))
33
- json_data.append({
34
- 'original_sentence': item['original_sentence'],
35
- 'generated_sentences': generated_sentences,
36
- 'confidence_scores': confidence_scores
37
- })
38
- json.dump(json_data, file, indent=4, ensure_ascii=False)
39
-
40
  def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
41
  sentences = extract_sentences_from_excel(file)
42
  data = []
43
 
44
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as tmp:
45
- for sentence in sentences:
46
- sentence = sentence.strip()
47
- if not sentence:
48
- continue
49
-
50
- generate_kwargs = {
51
- "temperature": temperature,
52
- "max_new_tokens": max_new_tokens,
53
- "top_p": top_p,
54
- "repetition_penalty": repetition_penalty,
55
- "do_sample": True,
56
- "seed": 42,
57
- }
58
 
59
- try:
60
- stream = client.text_generation(f"{prompt} Output the response in the following JSON format: {{'generated_sentence': 'The generated sentence text', 'confidence_score': 0.9}} {sentence}", **generate_kwargs, stream=True, details=True, return_full_text=False)
61
- output = ""
62
- for response in stream:
63
- output += response.token.text
 
 
 
 
64
 
65
- data.append({"original_sentence": sentence, "generated_data": output})
 
66
 
67
- except Exception as e:
68
- print(f"Error generating data for sentence '{sentence}': {e}")
69
 
70
- save_to_json(data, tmp.name)
71
- tmp_path = tmp.name
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- return tmp_path
 
74
 
 
75
  gr.Interface(
76
  fn=generate,
77
  inputs=[
@@ -82,8 +73,8 @@ gr.Interface(
82
  gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
83
  gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
84
  ],
85
- outputs=gr.File(label="Synthetic Data "),
86
  title="SDG",
87
- description="AYE QABIL.",
88
  allow_flagging="never",
89
  ).launch()
 
14
 
15
  def extract_sentences_from_excel(file):
16
  df = pd.read_excel(file)
17
+ sentences = df['metn'].astype(str).tolist()
 
 
18
  return sentences
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
21
  sentences = extract_sentences_from_excel(file)
22
  data = []
23
 
24
+ generate_kwargs = {
25
+ "temperature": temperature,
26
+ "max_new_tokens": max_new_tokens,
27
+ "top_p": top_p,
28
+ "repetition_penalty": repetition_penalty,
29
+ "do_sample": True,
30
+ "seed": 42,
31
+ }
 
 
 
 
 
 
32
 
33
+ for sentence in sentences:
34
+ try:
35
+ stream = client.text_generation(f"{prompt} Output the response in the following JSON format: {{'generated_sentence': 'The generated sentence text', 'confidence_score': 0.9}} {sentence}", **generate_kwargs, stream=True, details=True, return_full_text=False)
36
+ output = ""
37
+ for response in stream:
38
+ output += response.token.text
39
+ data.append({"original_sentence": sentence, "generated_data": output})
40
+ except Exception as e:
41
+ print(f"Error generating data for sentence '{sentence}': {e}")
42
 
43
+ filename = "synthetic_data.json"
44
+ save_to_json(data, filename)
45
 
46
+ return filename
 
47
 
48
+ def save_to_json(data, filename):
49
+ json_data = []
50
+ for item in data:
51
+ generated_sentences = []
52
+ confidence_scores = []
53
+ for match in re.finditer(r"{'generated_sentence': '(.+?)', 'confidence_score': ([\d\.]+)}", item['generated_data']):
54
+ generated_sentences.append(match.group(1))
55
+ confidence_scores.append(float(match.group(2)))
56
+ json_data.append({
57
+ 'original_sentence': item['original_sentence'],
58
+ 'generated_sentences': generated_sentences,
59
+ 'confidence_scores': confidence_scores
60
+ })
61
 
62
+ with open(filename, mode='w', encoding='utf-8') as file:
63
+ json.dump(json_data, file, indent=4, ensure_ascii=False)
64
 
65
+ # Gradio arayüzü
66
  gr.Interface(
67
  fn=generate,
68
  inputs=[
 
73
  gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
74
  gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
75
  ],
76
+ outputs=gr.File(label="Synthetic Data"),
77
  title="SDG",
78
+ description=" *AYE* QABIL.",
79
  allow_flagging="never",
80
  ).launch()