ramalMr committed on
Commit
1fd65af
·
verified ·
1 Parent(s): adba430

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -31
app.py CHANGED
@@ -8,14 +8,6 @@ from io import StringIO
8
  # Initialize the inference client with your chosen model
9
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
10
 
11
- def format_prompt(message, history):
12
- prompt = "<s>"
13
- for user_prompt, bot_response in history:
14
- prompt += f"[INST] {user_prompt} [/INST]"
15
- prompt += f" {bot_response}</s> "
16
- prompt += f"[INST] {message} [/INST]"
17
- return prompt
18
-
19
  def extract_text_from_pdf(file):
20
  pdf_reader = PyPDF2.PdfReader(file)
21
  text = ""
@@ -23,10 +15,18 @@ def extract_text_from_pdf(file):
23
  text += pdf_reader.pages[page].extract_text()
24
  return text
25
 
26
- def generate_synthetic_data(sentences, generate_kwargs):
27
  synthetic_data = []
28
  for sentence in sentences:
29
- formatted_prompt = format_prompt(sentence, [])
 
 
 
 
 
 
 
 
30
  stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
31
  output = ""
32
  for response in stream:
@@ -34,22 +34,13 @@ def generate_synthetic_data(sentences, generate_kwargs):
34
  synthetic_data.append(output)
35
  return synthetic_data
36
 
37
- def generate(file, system_prompt, temperature, max_new_tokens, top_p, repetition_penalty):
38
  # Extract text and split into sentences
39
  text = extract_text_from_pdf(file)
40
  sentences = text.split('.')
41
  random.shuffle(sentences) # Shuffle sentences
42
 
43
- generate_kwargs = dict(
44
- temperature=temperature,
45
- max_new_tokens=max_new_tokens,
46
- top_p=top_p,
47
- repetition_penalty=repetition_penalty,
48
- do_sample=True,
49
- seed=42,
50
- )
51
-
52
- synthetic_data = generate_synthetic_data(sentences, generate_kwargs)
53
 
54
  # Convert synthetic data to CSV
55
  df = pd.DataFrame(synthetic_data, columns=["Synthetic Data"])
@@ -57,19 +48,16 @@ def generate(file, system_prompt, temperature, max_new_tokens, top_p, repetition
57
  df.to_csv(csv_buffer, index=False)
58
  return gr.File(value=csv_buffer.getvalue(), file_name="synthetic_data.csv")
59
 
60
- additional_inputs = [
61
- gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
62
- gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
63
- gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
64
- gr.Slider(label="Repetition penalty", value=1.2, minimum=1.0, maximum=2.0, step=0.05, interactive=True, info="Penalize repeated tokens"),
65
- gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"]),
66
- ]
67
-
68
  gr.Interface(
69
  fn=generate,
70
- inputs=[gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"]), "state", "number", "number", "number", "number"],
 
 
 
 
 
 
71
  outputs="file",
72
- additional_inputs=additional_inputs,
73
  title="Synthetic Data Generation",
74
  description="This tool generates synthetic data from the sentences in your PDF.",
75
  allow_flagging="never",
 
# Initialize the inference client with your chosen model.
# NOTE(review): requires `from huggingface_hub import InferenceClient` at the
# top of the file (import not visible in this chunk) — confirm it is present.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in the uploaded PDF.

    Args:
        file: A path or binary file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        str: All page texts joined in page order (empty string for an empty PDF).
    """
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    # Iterate pages directly instead of indexing by page number; the original
    # loop line was lost at a diff-hunk boundary and is reconstructed here.
    for page in pdf_reader.pages:
        # extract_text() can return None for pages with no extractable text —
        # coalesce to "" so concatenation never raises TypeError.
        text += page.extract_text() or ""
    return text
17
 
18
def generate_synthetic_data(sentences, temperature, max_new_tokens, top_p, repetition_penalty):
    """Generate one synthetic text completion per input sentence.

    Each sentence is sent verbatim as the prompt (no chat template) to the
    module-level ``client`` and the streamed tokens are concatenated.

    Args:
        sentences: Iterable of prompt strings.
        temperature: Sampling temperature forwarded to ``text_generation``.
        max_new_tokens: Maximum number of tokens to generate per sentence.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        list[str]: One generated string per input sentence, in input order.
    """
    # The sampling options do not change between sentences, so build the
    # kwargs dict once instead of once per loop iteration.
    generate_kwargs = {
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,  # fixed seed keeps runs reproducible
    }
    synthetic_data = []
    for sentence in sentences:
        # Using the sentence directly as the prompt
        stream = client.text_generation(sentence, **generate_kwargs, stream=True, details=True, return_full_text=False)
        output = ""
        for response in stream:
            # NOTE(review): this accumulation line was cut at a diff-hunk
            # boundary; reconstructed from the streaming-details API shape —
            # confirm the token text attribute against the installed client.
            output += response.token.text
        synthetic_data.append(output)
    return synthetic_data
36
 
37
def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
    """Gradio handler: turn an uploaded PDF into a CSV of synthetic data.

    Args:
        file: The uploaded PDF file (as delivered by ``gr.File``).
        temperature: Sampling temperature slider value.
        max_new_tokens: Max-new-tokens slider value.
        top_p: Nucleus-sampling slider value.
        repetition_penalty: Repetition-penalty slider value.

    Returns:
        A ``gr.File`` carrying the generated CSV content.
    """
    # Extract text and split into sentences
    text = extract_text_from_pdf(file)
    # Naive sentence split on '.'; drop empty/whitespace-only fragments so we
    # never send a blank prompt to the model (split always yields at least one
    # trailing empty piece).
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    random.shuffle(sentences)  # Shuffle sentences

    synthetic_data = generate_synthetic_data(sentences, temperature, max_new_tokens, top_p, repetition_penalty)

    # Convert synthetic data to CSV
    df = pd.DataFrame(synthetic_data, columns=["Synthetic Data"])
    # Reconstructed: the buffer creation line was cut at a diff-hunk boundary;
    # df.to_csv below clearly writes into an in-memory StringIO.
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)
    # NOTE(review): gr.File typically expects a file *path* as value; passing
    # raw CSV text may not download correctly — consider writing to a temp
    # file instead. Verify against the installed Gradio version.
    return gr.File(value=csv_buffer.getvalue(), file_name="synthetic_data.csv")
50
 
 
 
 
 
 
 
 
 
51
  gr.Interface(
52
  fn=generate,
53
+ inputs=[
54
+ gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"]),
55
+ gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
56
+ gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
57
+ gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
58
+ gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
59
+ ],
60
  outputs="file",
 
61
  title="Synthetic Data Generation",
62
  description="This tool generates synthetic data from the sentences in your PDF.",
63
  allow_flagging="never",