import spaces
import gradio as gr
import os
import logging
from pytube import YouTube
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
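
# Dependency sketch (assumed from the imports above; the original requirements file is not shown):
#   gradio, spaces, pytube, torch, transformers, accelerate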

def get_text(url):
    """Download the audio track of a YouTube video and return its transcription."""
    if url != '':
        output_text_transcribe = ''
        yt = YouTube(url)
        video = yt.streams.filter(only_audio=True).first()
        out_file = video.download(output_path=".")
        file_stats = os.stat(out_file)
        logging.info(f'Size of audio file in Bytes: {file_stats.st_size}')
        if file_stats.st_size <= 30000000:
            base, ext = os.path.splitext(out_file)
            new_file = base + '.mp3'
            os.rename(out_file, new_file)
            # Transcribe via the pipeline-based helper defined below; the default
            # Whisper checkpoint is assumed here.
            result = transcribe_audio(new_file, "openai/whisper-large-v3")
            return result.strip()
        else:
            logging.error('Videos for transcription on this space are limited to about 1.5 hours. Sorry about this limit, but some joker thought they could stop this tool from working by transcribing many extremely long videos. Please visit https://steve.digital to contact me about this space.')
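
# Note (assumption): `spaces` is imported for Hugging Face ZeroGPU support; on ZeroGPU
# hardware a GPU-bound function such as transcribe_audio would typically carry the
# @spaces.GPU decorator.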

def transcribe_audio(audio, model_id):
    """Transcribe an uploaded audio file with the selected speech-to-text checkpoint."""
    if audio is None:
        return "Please upload an audio file."
    if model_id is None:
        return "Please select a model."

    # Use the GPU in half precision when available, otherwise fall back to CPU/float32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    # Chunked long-form transcription via the ASR pipeline.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=25,
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )
    result = pipe(audio)
    return result["text"]

def proofread(text):
    """Proofread transcribed text with a Chinese instruction-tuned Llama-3 model."""
    if text is None:
        return "Please provide the transcribed text for proofreading."

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Prompt (Traditional Chinese): "Tidy up this text in Traditional Chinese: break it into
    # paragraphs, correct typos, and append the key points of the whole passage at the end."
    prompt = "用繁體中文整理這段文字,分段及改正錯別字,最後加上整段文字的重點。"

    model = AutoModelForCausalLM.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3", torch_dtype=torch_dtype)
    tokenizer = AutoTokenizer.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
    model.to(device)

    input_text = prompt + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    # do_sample=True is required for the temperature setting to take effect.
    output = model.generate(input_ids, max_length=len(input_ids[0]) + 50, num_return_sequences=1, do_sample=True, temperature=0.7)
    proofread_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return proofread_text
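
# Illustrative standalone usage (hypothetical file name, not part of the Gradio app below):
#   text = transcribe_audio("sample.mp3", "openai/whisper-large-v3")
#   print(proofread(text))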

with gr.Blocks() as demo:
    gr.Markdown("""
    # Audio Transcription and Proofreading
    1. Upload an audio file (Wait for the file to be fully loaded first)
    2. Select a model for transcription
    3. Proofread the transcribed text
    """)
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(sources="upload", type="filepath")
            input_text_url = gr.Textbox(label="Video URL")
            model_dropdown = gr.Dropdown(choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"], value="openai/whisper-large-v3")
            transcribe_button = gr.Button("Transcribe")
            transcribed_text = gr.Textbox(label="Transcribed Text")
            proofread_button = gr.Button("Proofread")
            proofread_output = gr.Textbox(label="Proofread Text")

    transcribe_button.click(transcribe_audio, inputs=[audio, model_dropdown], outputs=transcribed_text)
    proofread_button.click(proofread, inputs=[transcribed_text], outputs=proofread_output)
    transcribed_text.change(proofread, inputs=[transcribed_text], outputs=proofread_output)

demo.launch()