ramalMr committed on
Commit
56c5fd5
·
verified ·
1 Parent(s): 18cb91d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -3
app.py CHANGED
@@ -8,7 +8,7 @@ import os
8
  import io
9
  import tempfile
10
  import re
11
-
12
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
13
 
14
  def extract_text_from_excel(file):
@@ -21,6 +21,16 @@ def save_to_csv(sentence, output, filename="synthetic_data.csv"):
21
  writer = csv.writer(file)
22
  writer.writerow([sentence, output])
23
 
 
 
 
 
 
 
 
 
 
 
24
  def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
25
  text = extract_text_from_excel(file)
26
  sentences = text.split('.')
@@ -29,7 +39,7 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
29
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
30
  fieldnames = ['Original Sentence', 'Generated Sentence']
31
  writer = csv.DictWriter(tmp, fieldnames=fieldnames)
32
- writer.writeheader()
33
 
34
  for sentence in sentences:
35
  sentence = sentence.strip()
@@ -55,7 +65,9 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
55
  generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']
56
 
57
  for generated_sentence in generated_sentences:
58
- writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})
 
 
59
 
60
  except Exception as e:
61
  print(f"Error generating data for sentence '{sentence}': {e}")
 
8
  import io
9
  import tempfile
10
  import re
11
+ from transformers import MarianMTModel, MarianTokenizer
12
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
13
 
14
  def extract_text_from_excel(file):
 
21
  writer = csv.writer(file)
22
  writer.writerow([sentence, output])
23
 
24
+
25
+
26
+ def translate_english_to_azerbaijani(text):
27
+ model_name = 'Helsinki-NLP/opus-mt-en-az'
28
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
29
+ model = MarianMTModel.from_pretrained(model_name)
30
+ translated = model.generate(**tokenizer.prepare_translation_batch([text]))
31
+ translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
32
+ return translated_text
33
+
34
  def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
35
  text = extract_text_from_excel(file)
36
  sentences = text.split('.')
 
39
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
40
  fieldnames = ['Original Sentence', 'Generated Sentence']
41
  writer = csv.DictWriter(tmp, fieldnames=fieldnames)
42
+ writer.writeheader()
43
 
44
  for sentence in sentences:
45
  sentence = sentence.strip()
 
65
  generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']
66
 
67
  for generated_sentence in generated_sentences:
68
+ translated_original = translate_english_to_azerbaijani(sentence)
69
+ translated_generated = translate_english_to_azerbaijani(generated_sentence)
70
+ writer.writerow({'Original Sentence': translated_original, 'Generated Sentence': translated_generated})
71
 
72
  except Exception as e:
73
  print(f"Error generating data for sentence '{sentence}': {e}")