ramalMr committed on
Commit
adba430
·
verified ·
1 Parent(s): 12b3267

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -75
app.py CHANGED
@@ -1,10 +1,12 @@
1
  from huggingface_hub import InferenceClient
2
  import gradio as gr
3
  import PyPDF2
 
 
 
4
 
5
- client = InferenceClient(
6
- "mistralai/Mixtral-8x7B-Instruct-v0.1"
7
- )
8
 
9
  def format_prompt(message, history):
10
  prompt = "<s>"
@@ -14,13 +16,29 @@ def format_prompt(message, history):
14
  prompt += f"[INST] {message} [/INST]"
15
  return prompt
16
 
17
- def generate(
18
- prompt, history, system_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0, file=None
19
- ):
20
- temperature = float(temperature)
21
- if temperature < 1e-2:
22
- temperature = 1e-2
23
- top_p = float(top_p)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  generate_kwargs = dict(
26
  temperature=temperature,
@@ -31,75 +49,28 @@ def generate(
31
  seed=42,
32
  )
33
 
34
- if file:
35
- text = extract_text_from_pdf(file)
36
- prompt = text
37
 
38
- formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
39
- stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
40
- output = ""
41
-
42
- for response in stream:
43
- output += response.token.text
44
- yield output
45
- return output
46
-
47
- def extract_text_from_pdf(file):
48
- pdf_reader = PyPDF2.PdfReader(file)
49
- text = ""
50
- for page in range(len(pdf_reader.pages)):
51
- text += pdf_reader.pages[page].extract_text()
52
- return text
53
 
54
- additional_inputs=[
55
- gr.Textbox(
56
- label="System Prompt",
57
- max_lines=1,
58
- interactive=True,
59
- ),
60
- gr.Slider(
61
- label="Temperature",
62
- value=0.9,
63
- minimum=0.0,
64
- maximum=1.0,
65
- step=0.05,
66
- interactive=True,
67
- info="Higher values produce more diverse outputs",
68
- ),
69
- gr.Slider(
70
- label="Max new tokens",
71
- value=256,
72
- minimum=0,
73
- maximum=5120,
74
- step=64,
75
- interactive=True,
76
- info="The maximum numbers of new tokens",
77
- ),
78
- gr.Slider(
79
- label="Top-p (nucleus sampling)",
80
- value=0.90,
81
- minimum=0.0,
82
- maximum=1,
83
- step=0.05,
84
- interactive=True,
85
- info="Higher values sample more low-probability tokens",
86
- ),
87
- gr.Slider(
88
- label="Repetition penalty",
89
- value=1.2,
90
- minimum=1.0,
91
- maximum=2.0,
92
- step=0.05,
93
- interactive=True,
94
- info="Penalize repeated tokens",
95
- ),
96
  gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"]),
97
  ]
98
 
99
- gr.ChatInterface(
100
  fn=generate,
101
- chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
 
102
  additional_inputs=additional_inputs,
103
- title="Synthetic-data-generation-aze",
104
- concurrency_limit=20,
105
- ).launch(show_api=False)
 
 
1
  from huggingface_hub import InferenceClient
2
  import gradio as gr
3
  import PyPDF2
4
+ import random
5
+ import pandas as pd
6
+ from io import StringIO
7
 
8
+ # Initialize the inference client with your chosen model
9
+ client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 
10
 
11
  def format_prompt(message, history):
12
  prompt = "<s>"
 
16
  prompt += f"[INST] {message} [/INST]"
17
  return prompt
18
 
19
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: The PDF source; it is handed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The text extracted from each page, joined in page order with no
        separator between pages.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Iterate the pages directly rather than by index, and fall back to ""
    # so a page that yields no text cannot break the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
25
+
26
def generate_synthetic_data(sentences, generate_kwargs):
    """Generate one model completion per non-blank sentence.

    Args:
        sentences: Iterable of strings; each one is used as a prompt message.
        generate_kwargs: Keyword arguments forwarded to
            ``client.text_generation``.

    Returns:
        A list of generated strings, in the same order as the non-blank
        entries of ``sentences``.
    """
    synthetic_data = []
    for sentence in sentences:
        # Text split on "." leaves empty / whitespace-only fragments (e.g.
        # after the final period); sending those would spend a request on an
        # empty instruction, so skip them.
        if not sentence.strip():
            continue
        formatted_prompt = format_prompt(sentence, [])
        stream = client.text_generation(
            formatted_prompt,
            **generate_kwargs,
            stream=True,
            details=True,
            return_full_text=False,
        )
        # Consume the token stream and assemble the full completion.
        synthetic_data.append("".join(response.token.text for response in stream))
    return synthetic_data
36
+
37
+ def generate(file, system_prompt, temperature, max_new_tokens, top_p, repetition_penalty):
38
+ # Extract text and split into sentences
39
+ text = extract_text_from_pdf(file)
40
+ sentences = text.split('.')
41
+ random.shuffle(sentences) # Shuffle sentences
42
 
43
  generate_kwargs = dict(
44
  temperature=temperature,
 
49
  seed=42,
50
  )
51
 
52
+ synthetic_data = generate_synthetic_data(sentences, generate_kwargs)
 
 
53
 
54
+ # Convert synthetic data to CSV
55
+ df = pd.DataFrame(synthetic_data, columns=["Synthetic Data"])
56
+ csv_buffer = StringIO()
57
+ df.to_csv(csv_buffer, index=False)
58
+ return gr.File(value=csv_buffer.getvalue(), file_name="synthetic_data.csv")
 
 
 
 
 
 
 
 
 
 
59
 
60
# Sampling controls, shown below the main inputs. Their order must match the
# trailing parameters of generate(): temperature, max_new_tokens, top_p,
# repetition_penalty.
additional_inputs = [
    gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
    gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
    gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
    gr.Slider(label="Repetition penalty", value=1.2, minimum=1.0, maximum=2.0, step=0.05, interactive=True, info="Penalize repeated tokens"),
]

# generate() takes exactly six arguments: (file, system_prompt) come from
# `inputs`, and the four sampling values come from `additional_inputs`.
# The PDF upload is declared only once so the function is not handed extra
# positional values.
gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"]),
        gr.Textbox(label="System Prompt", max_lines=1, interactive=True),
    ],
    outputs="file",
    additional_inputs=additional_inputs,
    title="Synthetic Data Generation",
    description="This tool generates synthetic data from the sentences in your PDF.",
    allow_flagging="never",
).launch()