ramalMr committed on
Commit
e798af8
·
verified ·
1 Parent(s): 45b3e18

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -5
app.py CHANGED
@@ -10,7 +10,6 @@ import io
10
  import tempfile
11
  import re
12
 
13
- # Initialize the inference client with your chosen model
14
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
15
 
16
  def extract_text_from_pdf(file):
@@ -30,9 +29,8 @@ def save_to_csv(sentence, output, filename="synthetic_data.csv"):
30
  def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
31
  text = extract_text_from_pdf(file)
32
  sentences = text.split('.')
33
- random.shuffle(sentences)
34
 
35
-
36
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
37
  fieldnames = ['Original Sentence', 'Generated Sentence']
38
  writer = csv.DictWriter(tmp, fieldnames=fieldnames)
@@ -58,9 +56,8 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
58
  for response in stream:
59
  output += response.token.text
60
 
61
-
62
  generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
63
- generated_sentences = [s.strip() for s in generated_sentences if s.strip()]
64
 
65
  for generated_sentence in generated_sentences:
66
  writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})
 
10
  import tempfile
11
  import re
12
 
 
13
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
14
 
15
  def extract_text_from_pdf(file):
 
29
  def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
30
  text = extract_text_from_pdf(file)
31
  sentences = text.split('.')
32
+ random.shuffle(sentences) # Shuffle sentences
33
 
 
34
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
35
  fieldnames = ['Original Sentence', 'Generated Sentence']
36
  writer = csv.DictWriter(tmp, fieldnames=fieldnames)
 
56
  for response in stream:
57
  output += response.token.text
58
 
 
59
  generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
60
+ generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']
61
 
62
  for generated_sentence in generated_sentences:
63
  writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})