File size: 9,189 Bytes
c50ad78 81e33eb c50ad78 4a201a6 c50ad78 3133efe 7747dd1 93ee49a 7747dd1 c50ad78 7747dd1 c50ad78 a8f539b c50ad78 a0024f7 2b78489 c50ad78 a0024f7 59e5568 a0024f7 c50ad78 3133efe c50ad78 a0024f7 c50ad78 a0024f7 c50ad78 a0024f7 c50ad78 a0024f7 c50ad78 2b78489 4a201a6 2b78489 59e5568 c50ad78 4a201a6 c50ad78 81e33eb 728cf94 81e33eb 781ee39 81e33eb 728cf94 81e33eb 4a201a6 81e33eb 728cf94 781ee39 81e33eb 728cf94 81e33eb a0024f7 728cf94 81e33eb 781ee39 81e33eb 4a201a6 81e33eb 4a201a6 781ee39 d4318d7 804dbeb d4318d7 804dbeb c50ad78 d4318d7 781ee39 d4318d7 804dbeb 8da7d41 3133efe 8da7d41 3133efe 8da7d41 2b78489 8da7d41 a0024f7 781ee39 2b78489 8da7d41 781ee39 804dbeb 81e33eb 8da7d41 81e33eb 8da7d41 81e33eb 781ee39 81e33eb 781ee39 c50ad78 4e2868d d4318d7 804dbeb d4318d7 c50ad78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 |
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from huggingface_hub import InferenceClient
import re
from streaming_stt_nemo import Model
import torch
import random
from openai import OpenAI
import subprocess
# Language code of the only speech-to-text engine that is loaded at import time.
default_lang = "en"
# Speech-to-text engines keyed by language code; just the default language is instantiated here.
engines = { default_lang: Model(default_lang) }
def transcribe(audio):
    """Return the speech-to-text transcript of an audio recording.

    Args:
        audio: Value handed to the engine's ``stt_file`` (the UI supplies
            a file path), or ``None`` when nothing was recorded.

    Returns:
        The first element of the ``stt_file`` result, or ``""`` when
        ``audio`` is ``None``.
    """
    if audio is not None:
        # Only the "en" engine is ever looked up here.
        return engines["en"].stt_file(audio)[0]
    return ""
# Hugging Face access token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
def client_fn(model):
    """Build the inference client that corresponds to a model label.

    The label is matched by substring, in a fixed order; the first match
    wins and an unrecognised label falls back to Phi-3 mini.

    Args:
        model: Display name of the model (as chosen in the UI dropdown).

    Returns:
        An ``OpenAI`` client for the "Llama 3 8B Service" label, otherwise
        an ``InferenceClient`` bound to the matching model repository.
    """
    # The self-hosted service must be tested before the generic "Llama" match.
    if "Llama 3 8B Service" in model:
        # NOTE(review): endpoint address and API key are hard-coded here.
        return OpenAI(
            base_url="http://52.76.81.56:60002/v1",
            api_key="token-abc123",
        )

    # Ordered (substring, repository) pairs; order is significant.
    hosted_models = (
        ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.2"),
        ("Phi", "microsoft/Phi-3-mini-4k-instruct"),
        ("Mixtral", "mistralai/Mixtral-8x7B-Instruct-v0.1"),
    )
    repo_id = next(
        (repo for keyword, repo in hosted_models if keyword in model),
        "microsoft/Phi-3-mini-4k-instruct",
    )
    return InferenceClient(repo_id)
def randomize_seed_fn(seed: int) -> int:
    """Draw a fresh random seed.

    Args:
        seed: Ignored; the incoming value is always replaced.

    Returns:
        A random integer between 0 and 999999, inclusive.
    """
    return random.randint(0, 999999)
# System prompt sent with every model request: persona, tone, and background
# about the creator. models() uses it as the system message on the chat path
# and as the prefix of the plain-text prompt on the text-generation path.
system_instructions1 = """
[SYSTEM] You are OPTIMUS Prime a personal AI voice assistant, Created by Jaward.
Keep conversation friendly, short, clear, and concise.
Avoid unnecessary introductions and answer the user's questions directly.
Respond in a normal, conversational manner while being friendly and helpful.
Remember previous parts of the conversation and use that context in your responses.
Your creator Jaward is an AI/ML Research Engineer at Linksoul AI. He is currently specializing in Artificial Intelligence (AI) research more specifically training and optimizing advance AI systems. He aspires to build not just human-like intelligence but AI Systems that augment human intelligence. He has contributed greatly to the opensource community with first-principles code implementations of AI/ML research papers. He did his first internship at Beijing Academy of Artificial Intelligence as an AI Researher where he contributed in cutting-edge AI research leading to him contributing to an insightful paper (AUTOAGENTS - A FRAMEWORK FOR AUTOMATIC AGENT GENERATION). The paper got accepted this year at IJCAI(International Joint Conference On AI). He is currently doing internship at LinkSoul AI - a small opensource AI Research startup in Beijing.
[USER]
"""
# Past turns as {"role": ..., "content": ...} dicts. This is module-level
# state, so it is shared by every call to models() in this process.
conversation_history = []
# Cap on stored history entries: 20 messages, i.e. 10 user/assistant exchanges.
_MAX_HISTORY_MESSAGES = 20


def _remember_exchange(user_text, assistant_text):
    """Append one user/assistant exchange to the shared history and trim it."""
    conversation_history.append({"role": "user", "content": user_text})
    conversation_history.append({"role": "assistant", "content": assistant_text})
    # Drop the oldest entries in place; a no-op while the list is within the cap.
    del conversation_history[:-_MAX_HISTORY_MESSAGES]


def models(text, model="Llama 3B Service", seed=42):
    """Generate the assistant's reply to ``text`` and record the exchange.

    Args:
        text: The user's message.
        model: Model label passed to ``client_fn``. A label containing
            "Llama 3 8B Service" uses the chat-completions path; any other
            label uses streamed text generation with the history flattened
            into a single prompt.
            NOTE(review): the default "Llama 3B Service" does not contain
            "Llama 3 8B Service", so the default takes the text-generation
            path -- confirm whether that is intended.
        seed: Replaced by a random value from ``randomize_seed_fn`` before
            use; only the text-generation path forwards it.

    Returns:
        The assistant's reply text.
    """
    seed = int(randomize_seed_fn(seed))
    client = client_fn(model)

    if "Llama 3 8B Service" in model:
        messages = [
            {"role": "system", "content": system_instructions1},
            *conversation_history,
            {"role": "user", "content": text},
        ]
        completion = client.chat.completions.create(
            model="/data/shared/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/",
            messages=messages,
        )
        reply = completion.choices[0].message.content
    else:
        # These clients take a single prompt string, so the history is
        # rendered as "User: ..." / "Assistant: ..." lines.
        history_text = "\n".join(
            f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
            for msg in conversation_history
        )
        formatted_prompt = f"{system_instructions1}\n\nConversation history:\n{history_text}\n\nUser: {text}\nOPTIMUS:"
        stream = client.text_generation(
            formatted_prompt,
            max_new_tokens=300,
            seed=seed,
            stream=True,
            details=True,
            return_full_text=False,
        )
        # Concatenate the streamed tokens, skipping the "</s>" end marker.
        reply = "".join(
            chunk.token.text for chunk in stream if chunk.token.text != "</s>"
        )

    _remember_exchange(text, reply)
    return reply
async def respond(audio, model, seed):
    """Turn a spoken request into a spoken reply.

    Transcribes the recording, asks the selected model for an answer and
    synthesises that answer to an audio file with edge-tts.

    Args:
        audio: Path of the recorded audio, or ``None``.
        model: Model label forwarded to ``models``.
        seed: Seed value forwarded to ``models``.

    Returns:
        Path of a temporary file holding the synthesised reply, or ``None``
        when there was no audio or nothing was transcribed.
    """
    transcript = "" if audio is None else transcribe(audio)
    if not transcript:
        return None

    answer = models(transcript, model, seed)
    speech = edge_tts.Communicate(answer, voice="en-US-ChristopherNeural")
    # delete=False keeps the file on disk after the handle is closed so the
    # returned path stays readable.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as handle:
        reply_path = handle.name
        await speech.save(reply_path)
    return reply_path
# Languages supported by seamless-expressive: display name -> the code that
# translate_speech() passes to the CLI as --tgt_lang.
LANGUAGE_CODES = dict(
    English="eng",
    Spanish="spa",
    French="fra",
    German="deu",
    Italian="ita",
    Chinese="cmn",
)
def translate_speech(audio_file, target_language):
    """Translate input speech (audio file) to the specified target language.

    Runs the ``expressivity_predict`` command-line tool and returns the path
    of the audio file it produced.

    Args:
        audio_file: Path of the recording to translate, or ``None``.
        target_language: A key of ``LANGUAGE_CODES`` (e.g. "Spanish").

    Returns:
        Path of the translated audio file, or ``None`` when no audio was
        given or the tool did not produce an output file.

    Raises:
        KeyError: ``target_language`` is not a key of ``LANGUAGE_CODES``.
        subprocess.CalledProcessError: the tool exited with a non-zero status.
    """
    if audio_file is None:
        return None

    language_code = LANGUAGE_CODES[target_language]
    # Write into a fresh directory per call. A fixed path in the working
    # directory lets overlapping requests overwrite each other's result, and
    # a stale file from an earlier run would pass the existence check below.
    output_file = os.path.join(
        tempfile.mkdtemp(prefix="translated_"), "translated_audio.wav"
    )
    command = [
        "expressivity_predict",
        audio_file,
        "--tgt_lang", language_code,
        "--model_name", "seamless_expressivity",
        "--vocoder_name", "vocoder_pretssel",
        "--gated-model-dir", "seamlessmodel",
        "--output_path", output_file,
    ]
    subprocess.run(command, check=True)

    if not os.path.exists(output_file):
        print(f"File not found: {output_file}")
        return None
    print(f"File created successfully: {output_file}")
    return output_file
def clear_history():
    """Reset the stored conversation to an empty list.

    Returns:
        A tuple of four ``None`` values (one per output listed on the
        currently commented-out Clear button).
    """
    global conversation_history
    conversation_history = list()
    return (None,) * 4
def voice_assistant_tab():
    """Return the Markdown heading shown when the Voice Assistant tab is selected."""
    heading = "# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>"
    return heading
def speech_translation_tab():
    """Return the Markdown heading shown when the Speech Translation tab is selected."""
    heading = "# <center><b>Hear how you sound in another language</b></center>"
    return heading
# Top-level UI: a heading plus two tabs (voice assistant and speech translation).
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost -- confirm it matches the intended layout.
with gr.Blocks(css="style.css") as demo:
    # Page heading; the tab-select handlers at the bottom overwrite it.
    description = gr.Markdown("# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>")
    with gr.Tabs() as tabs:
        with gr.TabItem("Voice Assistant") as voice_assistant:
            # Model labels; client_fn() matches them by substring.
            select = gr.Dropdown([
                'Llama 3 8B Service',
                'Mixtral 8x7B',
                'Llama 3 8B',
                'Mistral 7B v0.3',
                'Phi 3 mini',
            ],
            value="Llama 3 8B Service",
            label="Model"
            )
            # Hidden seed control; models() replaces the value with a random seed.
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=999999,
                step=1,
                value=0,
                visible=False
            )
            # Microphone recording, delivered to respond() as a file path.
            input = gr.Audio(label="User", sources="microphone", type="filepath", waveform_options=False)
            # Synthesised reply; autoplay=True is set on the component.
            output = gr.Audio(label="AI", type="filepath",
                interactive=False,
                autoplay=True,
                elem_classes="audio")
            # Wire recording + model + seed into respond().
            gr.Interface(
                fn=respond,
                inputs=[input, select, seed],
                outputs=[output],
                live=True
            )
        with gr.TabItem("Speech Translation") as speech_translation:
            input_audio = gr.Audio(label="User", sources="microphone", type="filepath", waveform_options=False)
            # Choices are the keys of LANGUAGE_CODES, in definition order.
            target_lang = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="Spanish",
                label="Target Language"
            )
            output_audio = gr.Audio(label="Translated Audio",
                interactive=False,
                autoplay=True,
                elem_classes="audio")
            # Wire recording + target language into translate_speech().
            gr.Interface(
                fn=translate_speech,
                inputs=[input_audio, target_lang],
                outputs=[output_audio],
                live=True
            )
    # Disabled "Clear" button that would call clear_history() and reset the four audio components.
    # clear_button = gr.Button("Clear")
    # clear_button.click(
    # fn=clear_history,
    # inputs=[],
    # outputs=[input, output, input_audio, output_audio],
    # api_name="clear"
    # )
    # Swap the page heading whenever a tab is selected.
    voice_assistant.select(fn=voice_assistant_tab, inputs=None, outputs=description)
    speech_translation.select(fn=speech_translation_tab, inputs=None, outputs=description)
if __name__ == "__main__":
    # Enable the request queue (max_size=200) and start the app when run as a script.
    demo.queue(max_size=200).launch()