import torch
import spaces
import gradio as gr
from threading import Thread
import re
import time
import tempfile
import os
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, TextIteratorStreamer
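# Multimodal (LLaVA-Next) processor and model, loaded in half precision on the first GPU.
# Note: this model is initialised here but is not yet wired into any of the Gradio
# interfaces defined further down in this file.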
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
model.to("cuda:0")
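# Whisper ASR settings used by the transcription pipeline below.
# TEMP_FILE_LIMIT_MB is defined here but not referenced elsewhere in this file.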
ASR_MODEL_NAME = "openai/whisper-large-v3"
ASR_BATCH_SIZE = 8
ASR_CHUNK_LENGTH_S = 30
TEMP_FILE_LIMIT_MB = 1000
from huggingface_hub import InferenceClient
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
device = 0 if torch.cuda.is_available() else "cpu"
asr_pl = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=ASR_CHUNK_LENGTH_S,
    device=device,
)
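# Rough usage sketch for the ASR pipeline above (the file name is hypothetical):
#   result = asr_pl("meeting.wav", batch_size=ASR_BATCH_SIZE,
#                   generate_kwargs={"task": "transcribe"}, return_timestamps=True)
#   result["text"]   -> full transcript as a single string
#   result["chunks"] -> list of {"timestamp": (start, end), "text": ...} segments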
application_title = "Enlight Innovations Limited -- Demo"
application_description = "This demo is designed to illustrate our basic idea and the feasibility of its implementation."
@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the full conversation: system prompt, prior turns, then the new user message.
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})

    # Stream the completion and yield the growing response so the
    # ChatInterface can render partial output as it arrives.
    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # some streamed chunks carry an empty delta
            response += token
            yield response
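# respond() is a generator: gr.ChatInterface consumes the successive yields to stream
# the reply. A hypothetical manual call would look like:
#   for partial in respond("Hello", [], "You are a friendly Chatbot.", 512, 0.7, 0.95):
#       print(partial)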
@spaces.GPU
def transcribe(asr_inputs, task):
    if asr_inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    # Run Whisper over the (possibly long) audio file and return the plain transcript.
    text = asr_pl(asr_inputs, batch_size=ASR_BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text
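# The `task` argument comes straight from the radio button below: "transcribe" keeps
# the source language, "translate" produces an English translation.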
#audio_input = gr.Audio(sources="upload", type="filepath", label="Audio: from file") #gr.Audio(sources="microphone", type="filepath", label="Audio: from microphone")
#audio_input_choice = gr.Radio(["audio file", "microphone"], label="Audio Input Source", value="audio file") #
audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input Source")
task_input_choice = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
transcribe_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        audio_input,
        #audio_input_choice,
        task_input_choice,
    ],
    outputs="text",
    title=application_title,
    description=application_description,
    allow_flagging="never",
)
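# This Interface becomes the "Step 1: Transcribe" tab in the TabbedInterface below;
# its single text output is the transcript returned by transcribe().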
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
chatbot_sys_output = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
chatbot_max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
chatbot_temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
chatbot_top_p = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)
chat_interface = gr.ChatInterface(
    respond,
    title=application_title,
    description=application_description,
    additional_inputs=[
        chatbot_sys_output,
        chatbot_max_tokens,
        chatbot_temperature,
        chatbot_top_p,
    ],
)
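# The additional_inputs above are passed positionally to respond() after
# (message, history): system_message, max_tokens, temperature, top_p.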
with gr.Blocks() as demo:
    gr.TabbedInterface([transcribe_interface, chat_interface], ["Step 1: Transcribe", "Step 2: Extract"])

    """
    def clear_audio_input():
        return None

    def update_audio_input(audio_input_choice):
        if audio_input_choice == "audio file":
            return gr.Audio(sources="upload", label="Audio: from file") #, type="filepath", label="Audio: from file")
        elif audio_input_choice == "microphone":
            return gr.Audio(sources="microphone", label="Audio: from microphone") #, type="filepath", label="Audio: from microphone")

    #audio_input_choice.input(fn=clear_audio_input, outputs=audio_input).then(fn=update_audio_input,
    audio_input_choice.input(fn=update_audio_input,
                             inputs=audio_input_choice,
                             outputs=audio_input
                             )
    """
if __name__ == "__main__":
    demo.queue().launch() #demo.launch()