ramalMr committed on
Commit
d04019f
·
verified ·
1 Parent(s): 0c82cc4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -86
app.py CHANGED
@@ -2,83 +2,43 @@ from huggingface_hub import InferenceClient
2
  import gradio as gr
3
  import random
4
  import pandas as pd
5
- from io import BytesIO
6
  import csv
7
  import os
8
- import io
9
  import tempfile
10
  import re
11
- import streamlit as st
12
- import torch
13
  from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
14
- import time
15
- import logging
16
-
17
- if torch.cuda.is_available():
18
- device = torch.device("cuda:0")
19
- else:
20
- device = torch.device("cpu")
21
- logging.warning("GPU not found, using CPU, translation will be very slow.")
22
 
23
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
24
 
25
- lang_id = {
26
- "Afrikaans": "af",
27
- "Amharic": "am",
28
- "Arabic": "ar",
29
- "Asturian": "ast",
30
- "Azerbaijani": "az",
31
- "Bashkir": "ba",
32
- "Belarusian": "be",
33
- "Bulgarian": "bg",
34
- "Bengali": "bn",
35
- "Breton": "br",
36
- "Bosnian": "bs",
37
- "Catalan": "ca",
38
- "Cebuano": "ceb",
39
- "Czech": "cs",
40
- "Welsh": "cy",
41
- "Danish": "da",
42
- "German": "de",
43
- "Greeek": "el",
44
- "English": "en",
45
- "Spanish": "es",
46
- "Estonian": "et",
47
- "Persian": "fa",
48
- "Fulah": "ff",
49
- "Finnish": "fi",
50
- "French": "fr",
51
- "Western Frisian": "fy",
52
- "Irish": "ga",
53
- "Gaelic": "gd",
54
- "Galician": "gl",
55
- "Gujarati": "gu",
56
- "Hausa": "ha",
57
- "Hebrew": "he",
58
- "Hindi": "hi",
59
- "Croatian": "hr",
60
- "Haitian": "ht",
61
- "Hungarian": "hu",
62
- "Armenian": "hy",
63
- "Indonesian": "id"
64
- }
65
-
66
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def load_model(pretrained_model: str = "facebook/m2m100_1.2B", cache_dir: str = "models/"):
    """Load the M2M100 tokenizer and translation model.

    Both objects are created with ``from_pretrained`` from the same
    checkpoint name and cache directory. The model is moved to the
    module-level ``device`` and put into evaluation mode before it is
    returned.

    Args:
        pretrained_model: Checkpoint name passed to ``from_pretrained``.
        cache_dir: Directory passed to ``from_pretrained`` as ``cache_dir``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    shared_kwargs = {"cache_dir": cache_dir}
    vocab = M2M100Tokenizer.from_pretrained(pretrained_model, **shared_kwargs)
    translator = M2M100ForConditionalGeneration.from_pretrained(pretrained_model, **shared_kwargs)
    translator = translator.to(device)
    translator.eval()
    return vocab, translator
72
 
73
  def extract_text_from_excel(file):
74
  df = pd.read_excel(file)
75
  text = ' '.join(df['Unnamed: 1'].astype(str))
76
- return text
 
 
77
 
78
  def save_to_csv(sentence, output, filename="synthetic_data.csv"):
 
79
  with open(filename, mode='a', newline='', encoding='utf-8') as file:
80
  writer = csv.writer(file)
81
- writer.writerow([sentence, output])
82
 
83
  def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_similar_sentences):
84
  text = extract_text_from_excel(file)
@@ -88,7 +48,7 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_s
88
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
89
  fieldnames = ['Original Sentence', 'Generated Sentence']
90
  writer = csv.DictWriter(tmp, fieldnames=fieldnames)
91
- writer.writeheader()
92
 
93
  for sentence in sentences:
94
  sentence = sentence.strip()
@@ -117,28 +77,7 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_s
117
  if not generated_sentences:
118
  break
119
  generated_sentence = generated_sentences.pop(random.randrange(len(generated_sentences)))
120
-
121
- # Translate generated sentence to English
122
- tokenizer, model = load_model()
123
- src_lang = lang_id[language]
124
- trg_lang = lang_id["English"]
125
- tokenizer.src_lang = src_lang
126
- with torch.no_grad():
127
- encoded_input = tokenizer(generated_sentence, return_tensors="pt").to(device)
128
- generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.get_lang_id(trg_lang))
129
- translated_sentence = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
130
-
131
- # Translate original sentence to Azerbaijani
132
- tokenizer, model = load_model()
133
- src_lang = lang_id["English"]
134
- trg_lang = lang_id["Azerbaijani"]
135
- tokenizer.src_lang = src_lang
136
- with torch.no_grad():
137
- encoded_input = tokenizer(sentence, return_tensors="pt").to(device)
138
- generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.get_lang_id(trg_lang))
139
- translated_sentence_az = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
140
-
141
- writer.writerow({'Original Sentence': translated_sentence_az, 'Generated Sentence': translated_sentence})
142
 
143
  except Exception as e:
144
  print(f"Error generating data for sentence '{sentence}': {e}")
@@ -147,7 +86,7 @@ def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_s
147
 
148
  return tmp_path
149
 
150
- gr.Interface(
151
  fn=generate,
152
  inputs=[
153
  gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
@@ -156,7 +95,6 @@ gr.Interface(
156
  gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
157
  gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
158
  gr.Slider(label="Number of similar sentences", value=10, minimum=1, maximum=20, step=1, interactive=True, info="Number of similar sentences to generate for each original sentence"),
159
- gr.Dropdown(label="Language of the input data", choices=list(lang_id.keys()), value="English")
160
  ],
161
  outputs=gr.File(label="Synthetic Data "),
162
  title="SDG",
 
2
  import gradio as gr
3
  import random
4
  import pandas as pd
5
+ from io import BytesIO
6
  import csv
7
  import os
8
+ import io
9
  import tempfile
10
  import re
 
 
11
  from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
 
 
 
 
 
 
 
 
12
 
13
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
14
 
15
+ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
16
+ model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
17
+
18
def translate_to_english(text, source_lang):
    """Translate *text* from *source_lang* into English with M2M100.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to translate.
        source_lang: M2M100 language code of *text* (for example ``"az"``).

    Returns:
        The first decoded sequence produced by ``model.generate``, with
        special tokens removed.
    """
    # The tokenizer tags the encoded input with its current ``src_lang``.
    # Apply the caller's language for this encoding only, then restore the
    # previous value so other users of the shared tokenizer are unaffected.
    previous_lang = tokenizer.src_lang
    tokenizer.src_lang = source_lang
    try:
        encoded_input = tokenizer(text, return_tensors="pt")
    finally:
        tokenizer.src_lang = previous_lang

    # Forcing the first generated token to the English language id selects
    # the target language.
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
23
+
24
def translate_to_azerbaijani(text):
    """Translate English *text* into Azerbaijani with M2M100.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The English text to translate.

    Returns:
        The first decoded sequence produced by ``model.generate``, with
        special tokens removed.
    """
    # Pin the source language explicitly instead of relying on whatever
    # ``src_lang`` the shared tokenizer currently holds, and restore the
    # previous value afterwards.
    previous_lang = tokenizer.src_lang
    tokenizer.src_lang = "en"
    try:
        encoded_input = tokenizer(text, return_tensors="pt")
    finally:
        tokenizer.src_lang = previous_lang

    # Forcing the first generated token to the Azerbaijani language id
    # selects the target language.
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.get_lang_id("az"),
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  def extract_text_from_excel(file):
31
  df = pd.read_excel(file)
32
  text = ' '.join(df['Unnamed: 1'].astype(str))
33
+ source_lang = "az" # Azerbaijani
34
+ english_text = translate_to_english(text, source_lang)
35
+ return english_text
36
 
37
  def save_to_csv(sentence, output, filename="synthetic_data.csv"):
38
+ azerbaijani_output = translate_to_azerbaijani(output)
39
  with open(filename, mode='a', newline='', encoding='utf-8') as file:
40
  writer = csv.writer(file)
41
+ writer.writerow([sentence, azerbaijani_output])
42
 
43
  def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_similar_sentences):
44
  text = extract_text_from_excel(file)
 
48
  with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv') as tmp:
49
  fieldnames = ['Original Sentence', 'Generated Sentence']
50
  writer = csv.DictWriter(tmp, fieldnames=fieldnames)
51
+ writer.writeheader()
52
 
53
  for sentence in sentences:
54
  sentence = sentence.strip()
 
77
  if not generated_sentences:
78
  break
79
  generated_sentence = generated_sentences.pop(random.randrange(len(generated_sentences)))
80
+ writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  except Exception as e:
83
  print(f"Error generating data for sentence '{sentence}': {e}")
 
86
 
87
  return tmp_path
88
 
89
+ gr.Interface(
90
  fn=generate,
91
  inputs=[
92
  gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
 
95
  gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
96
  gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
97
  gr.Slider(label="Number of similar sentences", value=10, minimum=1, maximum=20, step=1, interactive=True, info="Number of similar sentences to generate for each original sentence"),
 
98
  ],
99
  outputs=gr.File(label="Synthetic Data "),
100
  title="SDG",