ramalMr committed on
Commit
838b223
·
verified ·
1 Parent(s): fd67ce4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -20
app.py CHANGED
@@ -1,11 +1,11 @@
 
1
  from huggingface_hub import InferenceClient
2
  import gradio as gr
3
  import random
4
  import pandas as pd
5
- from io import BytesIO
6
- import csv
7
  import os
8
- import io
9
  import tempfile
10
  import re
11
 
@@ -13,24 +13,20 @@ client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
13
 
14
  def extract_sentences_from_excel(file):
15
  df = pd.read_excel(file)
16
- text = ' '.join(df['Unnamed: 1'].astype(str))
17
  sentences = text.split('.')
18
  sentences = [s.strip() for s in sentences if s.strip()]
19
  return sentences
20
 
21
- def save_to_csv(sentence, output, filename="synthetic_data.csv"):
22
- with open(filename, mode='a', newline='', encoding='utf-8') as file:
23
- writer = csv.writer(file)
24
- writer.writerow([sentence, output])
25
 
26
  def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
27
  sentences = extract_sentences_from_excel(file)
 
28
 
29
- with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
30
- fieldnames = ['Original Sentence', 'Generated Sentence']
31
- writer = csv.DictWriter(tmp, fieldnames=fieldnames)
32
- writer.writeheader()
33
-
34
  for sentence in sentences:
35
  sentence = sentence.strip()
36
  if not sentence:
@@ -46,25 +42,26 @@ def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalt
46
  }
47
 
48
  try:
49
- stream = client.text_generation(f"{prompt} {sentence}", **generate_kwargs, stream=True, details=True, return_full_text=False)
50
  output = ""
51
  for response in stream:
52
  output += response.token.text
53
 
54
- generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
55
- generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']
56
-
57
- for generated_sentence in generated_sentences:
58
- writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})
59
 
60
  except Exception as e:
61
  print(f"Error generating data for sentence '{sentence}': {e}")
62
 
 
63
  tmp_path = tmp.name
64
 
65
  return tmp_path
66
 
67
- gr.Interface(
68
  fn=generate,
69
  inputs=[
70
  gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
 
1
+ import json
2
  from huggingface_hub import InferenceClient
3
  import gradio as gr
4
  import random
5
  import pandas as pd
6
+ from io import BytesIO
 
7
  import os
8
+ import io
9
  import tempfile
10
  import re
11
 
 
13
 
def extract_sentences_from_excel(file):
    """Return the sentences found in the 'Column_Name' column of an Excel file.

    The workbook is loaded with ``pd.read_excel``; every non-empty cell of
    the 'Column_Name' column is converted to text, the cells are joined with
    single spaces, and the result is split on '.' into sentences.

    Args:
        file: Anything ``pd.read_excel`` accepts (path or file-like object).

    Returns:
        A list of stripped, non-empty sentence strings, in document order.

    Raises:
        KeyError: If the sheet has no column named 'Column_Name'.
    """
    df = pd.read_excel(file)
    # Drop missing cells first: astype(str) would otherwise turn NaN into the
    # literal text "nan" and leak it into the extracted sentences.
    cells = df['Column_Name'].dropna().astype(str)
    text = ' '.join(cells)
    # Keep only fragments that still contain text after trimming whitespace.
    return [part.strip() for part in text.split('.') if part.strip()]
20
 
def save_to_json(data, filename="synthetic_data.json"):
    """Persist ``data`` to ``filename`` as a single, valid JSON document.

    If the file is missing or blank, ``data`` is written as-is. If the file
    already holds JSON, the new data is merged into it (existing content is
    treated as a list; list ``data`` extends it, any other value is appended)
    and the file is rewritten. Blindly appending a second ``json.dump`` to
    the file would produce concatenated documents that no JSON parser can
    read back.

    Args:
        data: JSON-serialisable value to store.
        filename: Destination path; defaults to "synthetic_data.json".

    Raises:
        ValueError: If the file already has content that is not valid JSON,
            so it cannot be merged without losing or corrupting data.
    """
    previous_text = ""
    if os.path.exists(filename):
        with open(filename, mode='r', encoding='utf-8') as file:
            previous_text = file.read()

    if previous_text.strip():
        try:
            existing = json.loads(previous_text)
        except json.JSONDecodeError as err:
            raise ValueError(
                f"Cannot merge into '{filename}': existing content is not valid JSON"
            ) from err
        payload = existing if isinstance(existing, list) else [existing]
        if isinstance(data, list):
            payload.extend(data)
        else:
            payload.append(data)
    else:
        # Fresh or empty file (e.g. a just-created temp file): store data unchanged.
        payload = data

    # Rewrite the whole file so it always contains exactly one JSON document.
    with open(filename, mode='w', encoding='utf-8') as file:
        json.dump(payload, file, indent=4, ensure_ascii=False)
 
24
 
25
  def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
26
  sentences = extract_sentences_from_excel(file)
27
+ data = []
28
 
29
+ with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as tmp:
 
 
 
 
30
  for sentence in sentences:
31
  sentence = sentence.strip()
32
  if not sentence:
 
42
  }
43
 
44
  try:
45
+ stream = client.text_generation(f"{prompt} Output the response in JSON format.", **generate_kwargs, stream=True, details=True, return_full_text=False)
46
  output = ""
47
  for response in stream:
48
  output += response.token.text
49
 
50
+ try:
51
+ json_output = json.loads(output)
52
+ data.append({"original_sentence": sentence, "generated_sentence": json_output})
53
+ except json.JSONDecodeError:
54
+ print(f"Error decoding JSON for sentence '{sentence}': {output}")
55
 
56
  except Exception as e:
57
  print(f"Error generating data for sentence '{sentence}': {e}")
58
 
59
+ save_to_json(data, tmp.name)
60
  tmp_path = tmp.name
61
 
62
  return tmp_path
63
 
64
+ gr.Interface(
65
  fn=generate,
66
  inputs=[
67
  gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),