ramalMr committed on
Commit
fa4d0d9
·
verified ·
1 Parent(s): 3c1274a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -22
app.py CHANGED
@@ -4,6 +4,8 @@ import PyPDF2
4
  import random
5
  import pandas as pd
6
  from io import StringIO
 
 
7
 
8
  # Initialize the inference client with your chosen model
9
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
@@ -15,14 +17,25 @@ def extract_text_from_pdf(file):
15
  text += pdf_reader.pages[page].extract_text()
16
  return text
17
 
18
- def generate_synthetic_data(sentences, temperature, max_new_tokens, top_p, repetition_penalty):
19
- synthetic_data = []
 
 
 
 
 
 
 
 
 
 
 
 
20
  for sentence in sentences:
21
- # Trim whitespace and skip if the sentence is empty
22
  sentence = sentence.strip()
23
  if not sentence:
24
  continue
25
-
26
  generate_kwargs = {
27
  "temperature": temperature,
28
  "max_new_tokens": max_new_tokens,
@@ -37,26 +50,12 @@ def generate_synthetic_data(sentences, temperature, max_new_tokens, top_p, repet
37
  output = ""
38
  for response in stream:
39
  output += response.token.text
40
- synthetic_data.append(output)
41
  except Exception as e:
42
  print(f"Error generating data for sentence '{sentence}': {e}")
43
- # Optionally, append a placeholder or error message to `synthetic_data` to maintain alignment with input sentences
44
- synthetic_data.append(f"Error: {e}")
45
- return synthetic_data
46
-
47
- def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
48
- # Extract text and split into sentences
49
- text = extract_text_from_pdf(file)
50
- sentences = text.split('.')
51
- random.shuffle(sentences) # Shuffle sentences
52
-
53
- synthetic_data = generate_synthetic_data(sentences, temperature, max_new_tokens, top_p, repetition_penalty)
54
 
55
- # Convert synthetic data to CSV
56
- df = pd.DataFrame(synthetic_data, columns=["Synthetic Data"])
57
- csv_buffer = StringIO()
58
- df.to_csv(csv_buffer, index=False)
59
- return gr.File(value=csv_buffer.getvalue(), file_name="synthetic_data.csv")
60
 
61
  gr.Interface(
62
  fn=generate,
@@ -69,6 +68,6 @@ gr.Interface(
69
  ],
70
  outputs="file",
71
  title="Synthetic Data Generation",
72
- description="This tool generates synthetic data from the sentences in your PDF.",
73
  allow_flagging="never",
74
  ).launch()
 
4
  import random
5
  import pandas as pd
6
  from io import StringIO
7
+ import csv
8
+ import os
9
 
10
  # Initialize the inference client with your chosen model
11
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 
17
  text += pdf_reader.pages[page].extract_text()
18
  return text
19
 
20
+ def save_to_csv(sentence, output, filename="synthetic_data.csv"):
21
+ with open(filename, mode='a', newline='', encoding='utf-8') as file:
22
+ writer = csv.writer(file)
23
+ writer.writerow([sentence, output])
24
+
25
+ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
26
+ text = extract_text_from_pdf(file)
27
+ sentences = text.split('.')
28
+ random.shuffle(sentences) # Shuffle sentences
29
+
30
+ # Header row for the CSV file
31
+ if not os.path.exists("synthetic_data.csv"):
32
+ save_to_csv("Original Sentence", "Synthetic Data")
33
+
34
  for sentence in sentences:
 
35
  sentence = sentence.strip()
36
  if not sentence:
37
  continue
38
+
39
  generate_kwargs = {
40
  "temperature": temperature,
41
  "max_new_tokens": max_new_tokens,
 
50
  output = ""
51
  for response in stream:
52
  output += response.token.text
53
+ save_to_csv(sentence, output)
54
  except Exception as e:
55
  print(f"Error generating data for sentence '{sentence}': {e}")
56
+ save_to_csv(sentence, f"Error: {e}")
 
 
 
 
 
 
 
 
 
 
57
 
58
+ return gr.File(value="synthetic_data.csv", file_name="synthetic_data.csv")
 
 
 
 
59
 
60
  gr.Interface(
61
  fn=generate,
 
68
  ],
69
  outputs="file",
70
  title="Synthetic Data Generation",
71
+ description="This tool generates synthetic data from the sentences in your PDF and saves it to a CSV file.",
72
  allow_flagging="never",
73
  ).launch()