# EIL-Demo / app.py
import torch
import spaces
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline, LlavaNextProcessor, LlavaNextForConditionalGeneration
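# LLaVA-NeXT vision-language model. Note that it is loaded onto the GPU below
# but is not referenced by any UI component in this file; presumably it is
# reserved for a later multimodal step of the demo.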
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
model.to("cuda:0")
ASR_MODEL_NAME = "openai/whisper-large-v3"
ASR_BATCH_SIZE = 8
ASR_CHUNK_LENGTH_S = 30
TEMP_FILE_LIMIT_MB = 1000
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
device = 0 if torch.cuda.is_available() else "cpu"
asr_pl = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=ASR_CHUNK_LENGTH_S,
    device=device,
)
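# A minimal sketch of how asr_pl is invoked (the file name "sample.wav" is a
# hypothetical example; the real call happens inside transcribe() below):
#
#     result = asr_pl("sample.wav", batch_size=ASR_BATCH_SIZE,
#                     generate_kwargs={"task": "transcribe"}, return_timestamps=True)
#     print(result["text"])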
application_title = "Enlight Innovations Limited -- Demo"
application_description = "This demo is designed to illustrate our basic idea and the feasibility of its implementation."
@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the full conversation in the chat-completion message format.
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})

    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # streamed deltas can be empty or None (e.g. role-only chunks)
            response += token
        yield response
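# respond() is a generator: gr.ChatInterface renders each yielded string as the
# progressively extended reply, which gives token-by-token streaming in the UI.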
@spaces.GPU
def transcribe(asr_inputs, task):
    if asr_inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    text = asr_pl(
        asr_inputs,
        batch_size=ASR_BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )["text"]
    return text
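# With return_timestamps=True the pipeline output also carries a "chunks" list
# of timestamped segments; only the plain "text" field is used here.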
"""Gradio User Interface"""
#audio_input = gr.Audio(sources="upload", type="filepath", label="Audio: from file") #gr.Audio(sources="microphone", type="filepath", label="Audio: from microphone")
#audio_input_choice = gr.Radio(["audio file", "microphone"], label="Audio Input Source", value="audio file") #
audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input Source")
task_input_choice = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
transcribe_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        audio_input,
        #audio_input_choice,
        task_input_choice,
    ],
    outputs="text",
    title=application_title,
    description=application_description,
    allow_flagging="never",
)
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
chatbot_main = gr.Chatbot(label="Extraction Output")
chatbot_sys_output = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
chatbot_max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
chatbot_temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
chatbot_top_p = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)
chat_interface = gr.ChatInterface(
    respond,
    title=application_title,
    description=application_description,
    chatbot=chatbot_main,
    additional_inputs=[
        chatbot_sys_output,
        chatbot_max_tokens,
        chatbot_temperature,
        chatbot_top_p,
    ],
)
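# The additional_inputs are forwarded to respond() after (message, history),
# in the order listed above.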
with gr.Blocks() as demo:
    gr.TabbedInterface([transcribe_interface, chat_interface], ["Step 1: Transcribe", "Step 2: Extract"])
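    # The two tabs form a loose two-step workflow: the transcript produced in
    # "Step 1: Transcribe" can be pasted into "Step 2: Extract" as the chat message.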
"""
def clear_audio_input():
return None
def update_audio_input(audio_input_choice):
if audio_input_choice == "audio file":
return gr.Audio(sources="upload", label="Audio: from file") #, type="filepath", label="Audio: from file")
elif audio_input_choice == "microphone":
return gr.Audio(sources="microphone", label="Audio: from microphone") #, type="filepath", label="Audio: from microphone")
#audio_input_choice.input(fn=clear_audio_input, outputs=audio_input).then(fn=update_audio_input,
audio_input_choice.input(fn=update_audio_input,
inputs=audio_input_choice,
outputs=audio_input
)
"""
if __name__ == "__main__":
    demo.queue().launch()