lukecq committed on
Commit 9d1731e · verified · 1 Parent(s): 531980d

Update app.py

Files changed (1)
  app.py +101 -156
app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 import time
+import transformers
 from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
 from io import BytesIO
 from urllib.request import urlopen
@@ -7,56 +8,47 @@ import librosa
 import os, json
 from sys import argv
 from vllm import LLM, SamplingParams
+import vllm
+
+from huggingface_hub import login
+TOKEN = os.environ.get("TOKEN", None)
+login(token=TOKEN)
+
+print("transformers version:", transformers.__version__)
+print("vllm version:", vllm.__version__)
+print("gradio version:", gr.__version__)
 
-# def load_model_processor(model_path):
-#     processor = AutoProcessor.from_pretrained(model_path)
-#     llm = LLM(
-#         model=model_path, trust_remote_code=True, gpu_memory_utilization=0.8,
-#         enforce_eager=True, device = "cuda",
-#         limit_mm_per_prompt={"audio": 5},
-#     )
-#     return llm, processor
 
 def load_model_processor(model_path):
     processor = AutoProcessor.from_pretrained(model_path)
-    model = Qwen2AudioForConditionalGeneration.from_pretrained(model_path, device_map="auto")
-    model_name = model_path.split("/")[-1]
-    return model, processor, model_name
+    llm = LLM(
+        model=model_path, trust_remote_code=True, gpu_memory_utilization=0.8,
+        enforce_eager=True, device = "cuda",
+        limit_mm_per_prompt={"audio": 5},
+    )
+    return llm, processor
 
-model_path1 = "Qwen/Qwen2-Audio-7B-Instruct" #argv[1]
+model_path1 = "SeaLLMs/SeaLLMs-Audio-7B"
 model1, processor1 = load_model_processor(model_path1)
 
-# def response_to_audio_conv(conversation, model=None, processor=None, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,
-#                            max_new_tokens = 2048):
-#     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-#     audios = []
-#     for message in conversation:
-#         if isinstance(message["content"], list):
-#             for ele in message["content"]:
-#                 if ele["type"] == "audio":
-#                     if ele['audio_url'] != None:
-#                         audios.append(librosa.load(
-#                             ele['audio_url'],
-#                             sr=processor.feature_extractor.sampling_rate)[0]
-#                         )
-
-#     sampling_params = SamplingParams(
-#         temperature=temperature, max_tokens=max_new_tokens, repetition_penalty=repetition_penalty, top_p=top_p, top_k=20,
-#         stop_token_ids=[],
-#     )
-
-#     input = {
-#         'prompt': text,
-#         'multi_modal_data': {
-#             'audio': [(audio, 16000) for audio in audios]
-#         }
-#     }
-
-#     output = model.generate([input], sampling_params=sampling_params)[0]
-#     response = output.outputs[0].text
-#     return response
-
-def response_to_audio_conv(conversation, model=None, processor=None, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,max_new_tokens = 2048):
+def response_to_audio(audio_url, text, model=None, processor=None, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,max_new_tokens = 2048):
+    if text == None:
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio_url": audio_url},
+            ]},]
+    elif audio_url == None:
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "text", "text": text},
+            ]},]
+    else:
+        conversation = [
+            {"role": "user", "content": [
+                {"type": "audio", "audio_url": audio_url},
+                {"type": "text", "text": text},
+            ]},]
+
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
     audios = []
     for message in conversation:
@@ -68,103 +60,49 @@ def response_to_audio_conv(conversation, model=None, processor=None, temperature
                             ele['audio_url'],
                             sr=processor.feature_extractor.sampling_rate)[0]
                         )
-    if audios != []:
-        inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True,sampling_rate=16000)
-    else:
-        inputs = processor(text=text, return_tensors="pt", padding=True)
-    inputs.input_ids = inputs.input_ids.to("cuda")
-    inputs = {k: v.to("cuda") for k, v in inputs.items() if v is not None}
-    generate_ids = model.generate(**inputs, max_new_tokens=2048, temperature = 0.3, do_sample=True)
-    generate_ids = generate_ids[:, inputs["input_ids"].size(1):]
-    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+    sampling_params = SamplingParams(
+        temperature=temperature, max_tokens=max_new_tokens, repetition_penalty=repetition_penalty, top_p=top_p, top_k=20,
+        stop_token_ids=[],
+    )
+
+    input = {
+        'prompt': text,
+        'multi_modal_data': {
+            'audio': [(audio, 16000) for audio in audios]
+        }
+    }
+
+    output = model.generate([input], sampling_params=sampling_params)[0]
+    response = output.outputs[0].text
     return response
 
-def print_like_dislike(x: gr.LikeData):
-    print(x.index, x.value, x.liked)
-
-def add_message(history, message):
-    paths = []
-    for turn in history:
-        if turn['role'] == "user" and type(turn['content']) != str:
-            paths.append(turn['content'][0])
-    for x in message["files"]:
-        if x not in paths:
-            history.append({"role": "user", "content": {"path": x}})
-    if message["text"] is not None:
-        history.append({"role": "user", "content": message["text"]})
-    return history, gr.MultimodalTextbox(value=None, interactive=False)
-
-def format_user_messgae(message):
-    if type(message['content']) == str:
-        return {"role": "user", "content": [{"type": "text", "text": message['content']}]}
-    else:
-        return {"role": "user", "content": [{"type": "audio", "audio_url": message['content'][0]}]}
-
-def history_to_conversation(history):
-    conversation = []
-    audio_paths = []
-    for turn in history:
-        if turn['role'] == "user":
-            if not turn['content']:
-                continue
-            turn = format_user_messgae(turn)
-            if turn['content'][0]['type'] == 'audio':
-                if turn['content'][0]['audio_url'] in audio_paths:
-                    continue
-                else:
-                    audio_paths.append(turn['content'][0]['audio_url'])
-
-            if len(conversation) > 0 and conversation[-1]["role"] == "user":
-                conversation[-1]['content'].append(turn['content'][0])
-            else:
-                conversation.append(turn)
-        else:
-            conversation.append(turn)
-
-    print(json.dumps(conversation, indent=4, ensure_ascii=False))
-    return conversation
-
-def bot(history: list, temperature = 0.1,repetition_penalty=1.1, top_p = 0.9,
-        max_new_tokens = 2048):
-    conversation = history_to_conversation(history)
-    response = response_to_audio_conv(conversation, model=model1, processor=processor1, temperature = temperature,repetition_penalty=repetition_penalty, top_p = top_p, max_new_tokens = max_new_tokens)
-    # response = "Nice to meet you!"
-    print("Bot:",response)
-
-    history.append({"role": "assistant", "content": ""})
-    for character in response:
-        history[-1]["content"] += character
-        time.sleep(0.01)
-        yield history
-
-insturctions = """**Instruction**: there are three input format:
-1. text: input text message only
-2. audio: upload audio file or record a voice message
-3. audio + text: record a voice message and input text message"""
+def clear_inputs():
+    return None, "", ""
+
+def compare_responses(audio_url, text):
+    response1 = response_to_audio(audio_url, text, model1, processor1)
+    return response1
 
 with gr.Blocks() as demo:
+    # gr.Markdown(f"Evaluate {model_path1}")
     # gr.Markdown("""<p align="center"><img src="images/seal_logo.png" style="height: 80px"/><p>""")
     # gr.Image("images/seal_logo.png", elem_id="seal_logo", show_label=False,height=80,show_fullscreen_button=False)
+    # gr.Markdown("""<center><font size=8>SeaLLMs-Audio Demo</center>""")
+    gr.Markdown("""# SeaLLMs-Audio Demo""")
     gr.Markdown(
-        """<div style="text-align: center; font-size: 32px; font-weight: bold;">SeaLLMs-Audio ChatBot</div>""",
-    )
-
-    # Description text
-    gr.Markdown(
-        """<div style="text-align: center; font-size: 16px;">
-        This WebUI is based on SeaLLMs-Audio-7B-Chat, developed by Alibaba DAMO Academy.<br>
+        """\
+        <center><font size=4>This WebUI is based on SeaLLMs-Audio-7B-Chat, developed by Alibaba DAMO Academy.<br>
         You can interact with the chatbot in <b>English, Chinese, Indonesian, Thai, or Vietnamese</b>.<br>
-        For each round, you can input <b>audio and/or text</b>.
-        </div>""",
-    )
+        For the input, you can input <b>audio and/or text</center>.""")
 
     # Links with proper formatting
     gr.Markdown(
-        """<div style="text-align: center; font-size: 16px;">
+        """<center><font size=4>
        <a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Website]</a> &nbsp;
        <a href="https://huggingface.co/SeaLLMs/SeaLLMs-v3-7B-Chat">[Model🤗]</a> &nbsp;
        <a href="https://github.com/liuchaoqun/SeaLLMs-Audio">[Github]</a>
-        </div>""",
+        </center>""",
    )
 
    # gr.Markdown(insturctions)
@@ -175,36 +113,43 @@ with gr.Blocks() as demo:
     #         top_p = gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.1, label="Top P")
     #     with gr.Column():
     #         repetition_penalty = gr.Slider(minimum=0, maximum=2, value=1.1, step=0.1, label="Repetition Penalty")
-    chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages")
-
-    chat_input = gr.MultimodalTextbox(
-        interactive=True,
-        file_count="single",
-        file_types=['.wav'],
-        placeholder="Enter message (optional) ...",
-        show_label=False,
-        sources=["microphone", "upload"],
+
+    with gr.Row():
+        with gr.Column():
+            # mic_input = gr.Microphone(label="Record Audio", type="filepath", elem_id="mic_input")
+            mic_input = gr.Audio(sources = ['upload', 'microphone'], label="Record Audio", type="filepath", elem_id="mic_input")
+        with gr.Column():
+            additional_input = gr.Textbox(label="Text Input")
+
+    # Button to trigger the function
+    with gr.Row():
+        btn_submit = gr.Button("Submit")
+        btn_clear = gr.Button("Clear")
+
+    with gr.Row():
+        output_text1 = gr.Textbox(label=model_path1.split('/')[-1], interactive=False, elem_id="output_text1")
+
+    btn_submit.click(
+        fn=compare_responses,
+        inputs=[mic_input, additional_input],
+        outputs=[output_text1],
     )
 
-    chat_msg = chat_input.submit(
-        add_message, [chatbot, chat_input], [chatbot, chat_input]
+    btn_clear.click(
+        fn=clear_inputs,
+        inputs=None,
+        outputs=[mic_input, additional_input, output_text1],
+        queue=False,
    )
-    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
-    # bot_msg = chat_msg.then(bot, [chatbot, temperature, repetition_penalty, top_p], chatbot, api_name="bot_response")
-    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
-
-    # chatbot.like(print_like_dislike, None, None, like_user_message=True)
-
-    clear_button = gr.ClearButton([chatbot, chat_input])
-
-    # PORT = 7950
-    # demo.launch(server_port=PORT, show_api = True, allowed_paths = [],
-    #             root_path = f"https://dsw-gateway.alibaba-inc.com/dsw81322/proxy/{PORT}/")
-
-    demo.launch(
-        share=False,
-        inbrowser=True,
-        server_port=7950,
-        server_name="0.0.0.0",
-        max_threads=40
-    )
+
+
+    # demo.launch(
+    #     share=False,
+    #     inbrowser=True,
+    #     server_port=7950,
+    #     server_name="0.0.0.0",
+    #     max_threads=40
+    # )
+
+demo.launch(share=True)
+demo.queue(default_concurrency_limit=40).launch(share=True)
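The commit drops the transformers generate() path entirely in favor of vLLM's offline multimodal API. Below is a minimal sketch of that inference path outside Gradio, mirroring what the new load_model_processor and response_to_audio do; the clip name "sample.wav" and the question text are hypothetical placeholders, and it assumes a Qwen2-Audio-compatible checkpoint fed 16 kHz mono audio, as the committed code does.

import librosa
from transformers import AutoProcessor
from vllm import LLM, SamplingParams

model_path = "SeaLLMs/SeaLLMs-Audio-7B"
processor = AutoProcessor.from_pretrained(model_path)
llm = LLM(model=model_path, trust_remote_code=True,
          gpu_memory_utilization=0.8, enforce_eager=True,
          limit_mm_per_prompt={"audio": 5})

# One user turn with audio + text, built the same way response_to_audio builds it.
conversation = [{"role": "user", "content": [
    {"type": "audio", "audio_url": "sample.wav"},  # hypothetical local file
    {"type": "text", "text": "What is said in this clip?"},
]}]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
waveform = librosa.load("sample.wav", sr=processor.feature_extractor.sampling_rate)[0]

# vLLM takes the templated prompt plus raw (waveform, sample_rate) tuples.
outputs = llm.generate(
    [{"prompt": prompt, "multi_modal_data": {"audio": [(waveform, 16000)]}}],
    SamplingParams(temperature=0.1, top_p=0.9, top_k=20,
                   repetition_penalty=1.1, max_tokens=2048),
)
print(outputs[0].outputs[0].text)

One quirk in the committed tail: demo.launch(share=True) blocks the script until the server exits, so the following demo.queue(default_concurrency_limit=40).launch(share=True) is only reached after the first server shuts down; keeping just the queued launch would match the apparent intent.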