ramalMr committed on
Commit
a5056fa
·
verified ·
1 Parent(s): e06c59d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -4
app.py CHANGED
@@ -8,6 +8,8 @@ import csv
8
  import os
9
  import io
10
  import tempfile
 
 
11
  # Initialize the inference client with your chosen model
12
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
13
 
@@ -24,15 +26,18 @@ def save_to_csv(sentence, output, filename="synthetic_data.csv"):
24
  writer.writerow([sentence, output])
25
 
26
 
 
27
  def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
28
  text = extract_text_from_pdf(file)
29
  sentences = text.split('.')
30
  random.shuffle(sentences) # Shuffle sentences
31
 
32
-
33
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
34
- writer = csv.writer(tmp)
35
-
 
 
36
  for sentence in sentences:
37
  sentence = sentence.strip()
38
  if not sentence:
@@ -52,7 +57,15 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
52
  output = ""
53
  for response in stream:
54
  output += response.token.text
55
- writer.writerow([sentence, output])
 
 
 
 
 
 
 
 
56
  except Exception as e:
57
  print(f"Error generating data for sentence '{sentence}': {e}")
58
 
 
8
  import os
9
  import io
10
  import tempfile
11
+ import re
12
+
13
  # Initialize the inference client with your chosen model
14
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
15
 
 
26
  writer.writerow([sentence, output])
27
 
28
 
29
+
30
  def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
31
  text = extract_text_from_pdf(file)
32
  sentences = text.split('.')
33
  random.shuffle(sentences) # Shuffle sentences
34
 
35
+ # Geçici dosya oluştur ve CSV yazıcısını başlat
36
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
37
+ fieldnames = ['Original Sentence', 'Generated Sentence']
38
+ writer = csv.DictWriter(tmp, fieldnames=fieldnames)
39
+ writer.writeheader() # CSV dosyasına kolon isimleri yazılır
40
+
41
  for sentence in sentences:
42
  sentence = sentence.strip()
43
  if not sentence:
 
57
  output = ""
58
  for response in stream:
59
  output += response.token.text
60
+
61
+ # Modelden gelen yanıtı cümlelere ayır
62
+ generated_sentences = re.split(r'[\.\?!]', output)
63
+ generated_sentences = [s.strip() for s in generated_sentences if s.strip()]
64
+
65
+ # Her cümleyi ayrı bir satır olarak CSV'ye yaz
66
+ for generated_sentence in generated_sentences:
67
+ writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})
68
+
69
  except Exception as e:
70
  print(f"Error generating data for sentence '{sentence}': {e}")
71