ramalMr committed on
Commit
8115786
·
verified ·
1 Parent(s): a1a44e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -20
app.py CHANGED
@@ -1,42 +1,35 @@
1
  from huggingface_hub import InferenceClient
2
  import gradio as gr
3
- import PyPDF2
4
- import random
5
  import pandas as pd
6
- from io import BytesIO
7
  import csv
8
  import os
9
- import io
10
  import tempfile
11
  import re
12
 
13
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
14
 
15
- def extract_text_from_pdf(file):
16
- pdf_reader = PyPDF2.PdfReader(file)
17
- text = ""
18
- for page in range(len(pdf_reader.pages)):
19
- text += pdf_reader.pages[page].extract_text()
20
- return text
21
 
22
  def save_to_csv(sentence, output, filename="synthetic_data.csv"):
23
  with open(filename, mode='a', newline='', encoding='utf-8') as file:
24
  writer = csv.writer(file)
25
  writer.writerow([sentence, output])
26
 
27
-
28
-
29
- def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
30
- text = extract_text_from_pdf(file)
31
- sentences = text.split('.')
32
  random.shuffle(sentences) # Shuffle sentences
33
 
34
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
35
  fieldnames = ['Original Sentence', 'Generated Sentence']
36
  writer = csv.DictWriter(tmp, fieldnames=fieldnames)
37
- writer.writeheader()
38
 
39
- for sentence in sentences:
40
  sentence = sentence.strip()
41
  if not sentence:
42
  continue
@@ -68,16 +61,18 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
68
  tmp_path = tmp.name
69
 
70
  return tmp_path
71
- gr.Interface(
 
72
  fn=generate,
73
  inputs=[
74
- gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"]),
75
  gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
76
  gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
77
  gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
78
  gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
 
79
  ],
80
- outputs=gr.File(label="Synthetic Data "),
81
  title="SDG",
82
  description="AYE QABIL.",
83
  allow_flagging="never",
 
1
  from huggingface_hub import InferenceClient
2
  import gradio as gr
 
 
3
  import pandas as pd
4
+ import random
5
  import csv
6
  import os
7
+ import io
8
  import tempfile
9
  import re
10
 
11
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
12
 
13
+ def extract_data_from_excel(file):
14
+ df = pd.read_excel(file)
15
+ return df.values.tolist()
 
 
 
16
 
17
  def save_to_csv(sentence, output, filename="synthetic_data.csv"):
18
  with open(filename, mode='a', newline='', encoding='utf-8') as file:
19
  writer = csv.writer(file)
20
  writer.writerow([sentence, output])
21
 
22
+ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_sentences=10000):
23
+ data = extract_data_from_excel(file)
24
+ sentences = [str(row) for row in data] # Convert each row to a string
 
 
25
  random.shuffle(sentences) # Shuffle sentences
26
 
27
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
28
  fieldnames = ['Original Sentence', 'Generated Sentence']
29
  writer = csv.DictWriter(tmp, fieldnames=fieldnames)
30
+ writer.writeheader()
31
 
32
+ for sentence in sentences[:num_sentences]: # Process the first num_sentences sentences
33
  sentence = sentence.strip()
34
  if not sentence:
35
  continue
 
61
  tmp_path = tmp.name
62
 
63
  return tmp_path
64
+
65
+ gr.Interface(
66
  fn=generate,
67
  inputs=[
68
+ gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx", ".xls"]),
69
  gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
70
  gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
71
  gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
72
  gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
73
+ gr.Slider(label="Number of sentences", value=10000, minimum=1, maximum=100000, step=1000, interactive=True, info="The number of sentences to generate"),
74
  ],
75
+ outputs=gr.File(label="Synthetic Data"),
76
  title="SDG",
77
  description="AYE QABIL.",
78
  allow_flagging="never",