import torch
import spaces
import gradio as gr
from threading import Thread
import re
import time
import tempfile
import os
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, TextIteratorStreamer
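# LLaVA-NeXT vision-language model and processor, loaded in half precision and moved to the GPU below;
# it does not appear to be wired into the interfaces defined further down yet.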
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
model.to("cuda:0")
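# Configuration for the Whisper ASR pipeline created below.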
ASR_MODEL_NAME = "openai/whisper-large-v3"
ASR_BATCH_SIZE = 8
ASR_CHUNK_LENGTH_S = 30
TEMP_FILE_LIMIT_MB = 1000
from huggingface_hub import InferenceClient
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
device = 0 if torch.cuda.is_available() else "cpu"
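# Chunked Whisper pipeline: long audio is split into 30-second windows and processed in batches.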
asr_pl = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=ASR_CHUNK_LENGTH_S,
    device=device,
)
application_title = "Enlight Innovations Limited -- Demo"
application_description = "This demo is designed to illustrate our basic idea and the feasibility of its implementation."
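# Streaming chat handler: rebuilds the conversation as chat-completion messages and yields the
# partial response as new tokens arrive from the Inference API.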
@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    # Rebuild the conversation history as alternating user/assistant messages.
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # Stream the completion and yield the accumulated response so the UI updates token by token.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # the final streamed chunk may carry an empty delta
            response += token
        yield response
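# ASR handler for the transcription tab: runs the Whisper pipeline on the submitted audio file.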
@spaces.GPU
def transcribe(inputs, input_choice, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # input_choice only selects which audio widget is shown; it is not needed for transcription itself.
    text = asr_pl(inputs, batch_size=ASR_BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text
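# Inputs for the transcription tab: the radio buttons select the audio source and the Whisper task.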
audio_input = gr.Audio(sources="upload", type="filepath", label="Audio: from file")  # matches the default "audio file" choice; swapped for a microphone input by update_audio_input()
audio_input_choice = gr.Radio(["audio file", "microphone"], label="Audio", value="audio file")
task_input_choice = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
transcribe_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        audio_input,
        audio_input_choice,
        task_input_choice,
    ],
    outputs="text",
    title=application_title,
    description=application_description,
    allow_flagging="never",
)
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
chatbot_sys_output = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
chatbot_max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
chatbot_temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
chatbot_top_p = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)
chat_interface = gr.ChatInterface(
    respond,
    title=application_title,
    description=application_description,
    additional_inputs=[
        chatbot_sys_output,
        chatbot_max_tokens,
        chatbot_temperature,
        chatbot_top_p,
    ],
)
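# Compose the two interfaces as tabs; the radio toggle swaps the audio input between file upload and microphone.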
with gr.Blocks() as demo:
    gr.TabbedInterface([transcribe_interface, chat_interface], ["Step 1: Transcribe", "Step 2: Extract"])

    def update_audio_input(audio_input_choice):
        if audio_input_choice == "audio file":
            return gr.Audio(sources="upload", type="filepath", label="Audio: from file")
        elif audio_input_choice == "microphone":
            return gr.Audio(sources="microphone", type="filepath", label="Audio: from microphone")

    audio_input_choice.input(
        fn=update_audio_input,
        inputs=audio_input_choice,
        outputs=audio_input,
    )
if __name__ == "__main__":
    demo.queue().launch()  # demo.launch()