naofunyannn commited on
Commit
b27c67e
·
verified ·
1 Parent(s): c0892a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +240 -240
app.py CHANGED
@@ -1,241 +1,241 @@
1
- import torch
2
- import gradio as gr
3
- import spaces
4
- from transformers import AutoModelForCausalLM, AutoTokenizer
5
- import os
6
- import re
7
- from polyglot.detect import Detector
8
- from nltk.translate.bleu_score import sentence_bleu
9
-
10
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
11
- MODEL = "LLaMAX/LLaMAX3-8B-Alpaca"
12
- RELATIVE_MODEL="LLaMAX/LLaMAX3-8B"
13
-
14
- TITLE = "<h1><center>LLaMAX Translator</center></h1>"
15
-
16
-
17
- model = AutoModelForCausalLM.from_pretrained(
18
- MODEL,
19
- torch_dtype=torch.float16,
20
- device_map="auto")
21
- tokenizer = AutoTokenizer.from_pretrained(MODEL)
22
-
23
-
24
- def lang_detector(text):
25
- min_chars = 5
26
- if len(text) < min_chars:
27
- return "Input text too short"
28
- try:
29
- detector = Detector(text).language
30
- lang_info = str(detector)
31
- code = re.search(r"name: (\w+)", lang_info).group(1)
32
- return code
33
- except Exception as e:
34
- return f"ERROR:{str(e)}"
35
-
36
- def Prompt_template(inst, prompt, query, src_language, trg_language):
37
- inst = inst.format(src_language=src_language, trg_language=trg_language)
38
- instruction = f"`{inst}`"
39
- prompt = (
40
- f'{prompt}'
41
- f'### Instruction:\n{instruction}\n'
42
- f'### Input:\n{query}\n### Response:'
43
- )
44
- return prompt
45
-
46
- # Unfinished
47
- def chunk_text():
48
- pass
49
-
50
- # Function to calculate BLEU score
51
- def calculate_bleu_score(candidate: str, references: list):
52
- candidate_tokens = candidate.split() # Tokenizing the candidate output
53
- bleu_score = sentence_bleu(references, candidate_tokens) # Calculating BLEU score
54
- return bleu_score
55
-
56
- @spaces.GPU(duration=60)
57
- def translate(
58
- source_text: str,
59
- source_lang: str,
60
- target_lang: str,
61
- inst: str,
62
- prompt: str,
63
- max_length: int,
64
- temperature: float,
65
- top_p: float,
66
- rp: float):
67
-
68
- print(f'Text is - {source_text}')
69
-
70
- prompt = Prompt_template(inst, prompt, source_text, source_lang, target_lang)
71
- input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
72
-
73
- generate_kwargs = dict(
74
- input_ids=input_ids,
75
- max_length=max_length,
76
- do_sample=True,
77
- temperature=temperature,
78
- top_p=top_p,
79
- repetition_penalty=rp,
80
- )
81
-
82
- outputs = model.generate(**generate_kwargs)
83
-
84
- resp = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
85
-
86
- #yield resp[len(prompt):]
87
- # Calculate BLEU score
88
- '''
89
- references = [
90
- 'this is a dog'.split(),
91
- 'it is dog'.split(),
92
- 'dog it is'.split(),
93
- 'a dog, it is'.split()
94
- ]
95
- bleu_score = calculate_bleu_score(resp[len(prompt):], references) # Calculate BLEU score
96
- '''
97
- references = [resp[len(prompt):].split()] # Use the generated response as the reference
98
- bleu_score = calculate_bleu_score(resp[len(prompt):], references) # Calculate BLEU score
99
-
100
- yield resp[len(prompt):], bleu_score
101
-
102
- CSS = """
103
- h1 {
104
- text-align: center;
105
- display: block;
106
- height: 10vh;
107
- align-content: center;
108
- font-family: Arial, Helvetica, sans-serif;
109
- }
110
- footer {
111
- visibility: hidden;
112
- }
113
- font-family: Arial, Helvetica, sans-serif;
114
- """
115
-
116
- LICENSE = """
117
- Model: <a href="https://huggingface.co/LLaMAX/LLaMAX3-8B-Alpaca">LLaMAX3-8B-Alpaca</a>
118
- """
119
-
120
- LANG_LIST = ['Akrikaans', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Asturian', 'Azerbaijani', \
121
- 'Belarusian', 'Bengali', 'Bosnian', 'Bulgarian', 'Burmese', \
122
- 'Catalan', 'Cebuano', 'Simplified Chinese', 'Traditional Chinese', 'Croatian', 'Czech', \
123
- 'Danish', 'Dutch', 'English', 'Estonian', 'Filipino', 'Finnish', 'French', 'Fulah', \
124
- 'Galician', 'Ganda', 'Georgian', 'German', 'Greek', 'Gujarati', \
125
- 'Hausa', 'Hebrew', 'Hindi', 'Hungarian', \
126
- 'Icelandic', 'Igbo', 'Indonesian', 'Irish', 'Italian', \
127
- 'Japanese', 'Javanese', \
128
- 'Kabuverdianu', 'Kamba', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Kyrgyz', \
129
- 'Lao', 'Latvian', 'Lingala', 'Lithuanian', 'Luo', 'Luxembourgish', \
130
- 'Macedonian', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Mongolian', \
131
- 'Nepali', 'Northern', 'Norwegian', 'Nyanja', \
132
- 'Occitan', 'Oriya', 'Oromo', \
133
- 'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi', \
134
- 'Romanian', 'Russian', \
135
- 'Serbian', 'Shona', 'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'Sorani', 'Spanish', 'Swahili', 'Swedish', \
136
- 'Tajik', 'Tamil', 'Telugu', 'Thai', 'Turkish', \
137
- 'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek', \
138
- 'Vietnamese', 'Welsh', 'Wolof', 'Xhosa', 'Yoruba', 'Zulu']
139
-
140
- chatbot = gr.Chatbot(height=600)
141
-
142
- with gr.Blocks(theme="soft", css=CSS) as demo:
143
- gr.Markdown(TITLE)
144
- with gr.Row():
145
- with gr.Column(scale=4):
146
- source_text = gr.Textbox(
147
- label="Văn bản gốc",
148
- value="LLaMAX is a language model with powerful multilingual capabilities without loss instruction-following capabilities. "+\
149
- "LLaMAX supports translation between more than 100 languages, "+\
150
- "surpassing the performance of similarly scaled LLMs.",
151
- lines=10,
152
- )
153
- output_text = gr.Textbox(
154
- label="Văn bản đã được dịch",
155
- lines=10,
156
- show_copy_button=True,
157
- )
158
-
159
- bleu_score_output = gr.Textbox( # New holder area for BLEU score
160
- label="BLEU Score",
161
- lines=10,
162
- interactive=False,
163
- )
164
-
165
- with gr.Column(scale=1):
166
- source_lang = gr.Dropdown(
167
- label="Ngôn ngữ nguồn",
168
- value="English",
169
- choices=LANG_LIST,
170
- )
171
- target_lang = gr.Dropdown(
172
- label="Ngôn ngữ đích",
173
- value="Vietnamese",
174
- choices=LANG_LIST,
175
- )
176
- max_length = gr.Slider(
177
- label="Độ dài tối đa",
178
- minimum=512,
179
- maximum=8192,
180
- value=4000,
181
- step=8,
182
- )
183
- temperature = gr.Slider(
184
- label="Temperature",
185
- minimum=0,
186
- maximum=1,
187
- value=0.3,
188
- step=0.1,
189
- )
190
- top_p = gr.Slider(
191
- label="top_p",
192
- minimum=0.0,
193
- maximum=1.0,
194
- step=0.1,
195
- value=1.0,
196
- )
197
- rp = gr.Slider(
198
- label="Repetition penalty",
199
- minimum=1.0,
200
- maximum=2.0,
201
- step=0.1,
202
- value=1.2,
203
- )
204
- with gr.Accordion("Tùy chọn nâng cao", open=False):
205
- inst = gr.Textbox(
206
- label="Instruction",
207
- value="Translate the following sentences from {src_language} to {trg_language}.",
208
- lines=3,
209
- )
210
- prompt = gr.Textbox(
211
- label="Prompt",
212
- # Prompt 1
213
- #value="""Below is an instruction that describes a task, paired with an input that provides further context.
214
- #Write a response that appropriately completes the request.
215
- ### Instruction:
216
- #{instruction}
217
- ### Input:
218
- #{query}
219
- ### Response:""",#
220
- # Prompt 2
221
- value="""Below is an instruction that describes a task, paired with an input that provides further context.
222
- Write a response that ensuring accuracy and maintaining the tone and style of the original text.
223
- ### Instruction:
224
- {instruction}
225
- ### Input:
226
- {query}
227
- ### Response:""",
228
- lines=8,
229
- )
230
-
231
- with gr.Row():
232
- submit = gr.Button(value="Submit")
233
- clear = gr.ClearButton([source_text, output_text])
234
- gr.Markdown(LICENSE)
235
-
236
- #source_text.change(lang_detector, source_text, source_lang)
237
- #submit.click(fn=translate, inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp], outputs=[output_text])
238
- submit.click(fn=translate, inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp], outputs=[output_text, bleu_score_output])
239
-
240
- if __name__ == "__main__":
241
  demo.launch()
 
1
+ import torch
2
+ import gradio as gr
3
+ import spaces
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+ import os
6
+ import re
7
+ from polyglot.detect import Detector
8
+ from nltk.translate.bleu_score import sentence_bleu
9
+
10
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
11
+ MODEL = "LLaMAX/LLaMAX2-7B-Alpaca"
12
+ RELATIVE_MODEL="LLaMAX/LLaMAX2-7B"
13
+
14
+ TITLE = "<h1><center>LLaMAX Translator</center></h1>"
15
+
16
+
17
+ model = AutoModelForCausalLM.from_pretrained(
18
+ MODEL,
19
+ torch_dtype=torch.float16,
20
+ device_map="auto")
21
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
22
+
23
+
24
+ def lang_detector(text):
25
+ min_chars = 5
26
+ if len(text) < min_chars:
27
+ return "Input text too short"
28
+ try:
29
+ detector = Detector(text).language
30
+ lang_info = str(detector)
31
+ code = re.search(r"name: (\w+)", lang_info).group(1)
32
+ return code
33
+ except Exception as e:
34
+ return f"ERROR:{str(e)}"
35
+
36
+ def Prompt_template(inst, prompt, query, src_language, trg_language):
37
+ inst = inst.format(src_language=src_language, trg_language=trg_language)
38
+ instruction = f"`{inst}`"
39
+ prompt = (
40
+ f'{prompt}'
41
+ f'### Instruction:\n{instruction}\n'
42
+ f'### Input:\n{query}\n### Response:'
43
+ )
44
+ return prompt
45
+
46
+ # Unfinished
47
+ def chunk_text():
48
+ pass
49
+
50
+ # Function to calculate BLEU score
51
+ def calculate_bleu_score(candidate: str, references: list):
52
+ candidate_tokens = candidate.split() # Tokenizing the candidate output
53
+ bleu_score = sentence_bleu(references, candidate_tokens) # Calculating BLEU score
54
+ return bleu_score
55
+
56
+ @spaces.GPU(duration=60)
57
+ def translate(
58
+ source_text: str,
59
+ source_lang: str,
60
+ target_lang: str,
61
+ inst: str,
62
+ prompt: str,
63
+ max_length: int,
64
+ temperature: float,
65
+ top_p: float,
66
+ rp: float):
67
+
68
+ print(f'Text is - {source_text}')
69
+
70
+ prompt = Prompt_template(inst, prompt, source_text, source_lang, target_lang)
71
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
72
+
73
+ generate_kwargs = dict(
74
+ input_ids=input_ids,
75
+ max_length=max_length,
76
+ do_sample=True,
77
+ temperature=temperature,
78
+ top_p=top_p,
79
+ repetition_penalty=rp,
80
+ )
81
+
82
+ outputs = model.generate(**generate_kwargs)
83
+
84
+ resp = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
85
+
86
+ #yield resp[len(prompt):]
87
+ # Calculate BLEU score
88
+ '''
89
+ references = [
90
+ 'this is a dog'.split(),
91
+ 'it is dog'.split(),
92
+ 'dog it is'.split(),
93
+ 'a dog, it is'.split()
94
+ ]
95
+ bleu_score = calculate_bleu_score(resp[len(prompt):], references) # Calculate BLEU score
96
+ '''
97
+ references = [resp[len(prompt):].split()] # Use the generated response as the reference
98
+ bleu_score = calculate_bleu_score(resp[len(prompt):], references) # Calculate BLEU score
99
+
100
+ yield resp[len(prompt):], bleu_score
101
+
102
+ CSS = """
103
+ h1 {
104
+ text-align: center;
105
+ display: block;
106
+ height: 10vh;
107
+ align-content: center;
108
+ font-family: Arial, Helvetica, sans-serif;
109
+ }
110
+ footer {
111
+ visibility: hidden;
112
+ }
113
+ font-family: Arial, Helvetica, sans-serif;
114
+ """
115
+
116
+ LICENSE = """
117
+ Model: <a href="https://huggingface.co/LLaMAX/LLaMAX3-8B-Alpaca">LLaMAX3-8B-Alpaca</a>
118
+ """
119
+
120
+ LANG_LIST = ['Akrikaans', 'Amharic', 'Arabic', 'Armenian', 'Assamese', 'Asturian', 'Azerbaijani', \
121
+ 'Belarusian', 'Bengali', 'Bosnian', 'Bulgarian', 'Burmese', \
122
+ 'Catalan', 'Cebuano', 'Simplified Chinese', 'Traditional Chinese', 'Croatian', 'Czech', \
123
+ 'Danish', 'Dutch', 'English', 'Estonian', 'Filipino', 'Finnish', 'French', 'Fulah', \
124
+ 'Galician', 'Ganda', 'Georgian', 'German', 'Greek', 'Gujarati', \
125
+ 'Hausa', 'Hebrew', 'Hindi', 'Hungarian', \
126
+ 'Icelandic', 'Igbo', 'Indonesian', 'Irish', 'Italian', \
127
+ 'Japanese', 'Javanese', \
128
+ 'Kabuverdianu', 'Kamba', 'Kannada', 'Kazakh', 'Khmer', 'Korean', 'Kyrgyz', \
129
+ 'Lao', 'Latvian', 'Lingala', 'Lithuanian', 'Luo', 'Luxembourgish', \
130
+ 'Macedonian', 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Mongolian', \
131
+ 'Nepali', 'Northern', 'Norwegian', 'Nyanja', \
132
+ 'Occitan', 'Oriya', 'Oromo', \
133
+ 'Pashto', 'Persian', 'Polish', 'Portuguese', 'Punjabi', \
134
+ 'Romanian', 'Russian', \
135
+ 'Serbian', 'Shona', 'Sindhi', 'Slovak', 'Slovenian', 'Somali', 'Sorani', 'Spanish', 'Swahili', 'Swedish', \
136
+ 'Tajik', 'Tamil', 'Telugu', 'Thai', 'Turkish', \
137
+ 'Ukrainian', 'Umbundu', 'Urdu', 'Uzbek', \
138
+ 'Vietnamese', 'Welsh', 'Wolof', 'Xhosa', 'Yoruba', 'Zulu']
139
+
140
+ chatbot = gr.Chatbot(height=600)
141
+
142
+ with gr.Blocks(theme="soft", css=CSS) as demo:
143
+ gr.Markdown(TITLE)
144
+ with gr.Row():
145
+ with gr.Column(scale=4):
146
+ source_text = gr.Textbox(
147
+ label="Văn bản gốc",
148
+ value="LLaMAX is a language model with powerful multilingual capabilities without loss instruction-following capabilities. "+\
149
+ "LLaMAX supports translation between more than 100 languages, "+\
150
+ "surpassing the performance of similarly scaled LLMs.",
151
+ lines=10,
152
+ )
153
+ output_text = gr.Textbox(
154
+ label="Văn bản đã được dịch",
155
+ lines=10,
156
+ show_copy_button=True,
157
+ )
158
+
159
+ bleu_score_output = gr.Textbox( # New holder area for BLEU score
160
+ label="BLEU Score",
161
+ lines=10,
162
+ interactive=False,
163
+ )
164
+
165
+ with gr.Column(scale=1):
166
+ source_lang = gr.Dropdown(
167
+ label="Ngôn ngữ nguồn",
168
+ value="English",
169
+ choices=LANG_LIST,
170
+ )
171
+ target_lang = gr.Dropdown(
172
+ label="Ngôn ngữ đích",
173
+ value="Vietnamese",
174
+ choices=LANG_LIST,
175
+ )
176
+ max_length = gr.Slider(
177
+ label="Độ dài tối đa",
178
+ minimum=512,
179
+ maximum=8192,
180
+ value=4000,
181
+ step=8,
182
+ )
183
+ temperature = gr.Slider(
184
+ label="Temperature",
185
+ minimum=0,
186
+ maximum=1,
187
+ value=0.3,
188
+ step=0.1,
189
+ )
190
+ top_p = gr.Slider(
191
+ label="top_p",
192
+ minimum=0.0,
193
+ maximum=1.0,
194
+ step=0.1,
195
+ value=1.0,
196
+ )
197
+ rp = gr.Slider(
198
+ label="Repetition penalty",
199
+ minimum=1.0,
200
+ maximum=2.0,
201
+ step=0.1,
202
+ value=1.2,
203
+ )
204
+ with gr.Accordion("Tùy chọn nâng cao", open=False):
205
+ inst = gr.Textbox(
206
+ label="Instruction",
207
+ value="Translate the following sentences from {src_language} to {trg_language}.",
208
+ lines=3,
209
+ )
210
+ prompt = gr.Textbox(
211
+ label="Prompt",
212
+ # Prompt 1
213
+ #value="""Below is an instruction that describes a task, paired with an input that provides further context.
214
+ #Write a response that appropriately completes the request.
215
+ ### Instruction:
216
+ #{instruction}
217
+ ### Input:
218
+ #{query}
219
+ ### Response:""",#
220
+ # Prompt 2
221
+ value="""Below is an instruction that describes a task, paired with an input that provides further context.
222
+ Write a response that ensuring accuracy and maintaining the tone and style of the original text.
223
+ ### Instruction:
224
+ {instruction}
225
+ ### Input:
226
+ {query}
227
+ ### Response:""",
228
+ lines=8,
229
+ )
230
+
231
+ with gr.Row():
232
+ submit = gr.Button(value="Submit")
233
+ clear = gr.ClearButton([source_text, output_text])
234
+ gr.Markdown(LICENSE)
235
+
236
+ #source_text.change(lang_detector, source_text, source_lang)
237
+ #submit.click(fn=translate, inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp], outputs=[output_text])
238
+ submit.click(fn=translate, inputs=[source_text, source_lang, target_lang, inst, prompt, max_length, temperature, top_p, rp], outputs=[output_text, bleu_score_output])
239
+
240
+ if __name__ == "__main__":
241
  demo.launch()