Spaces:

Merlintxu
/

Wav2Txt

Sleeping

App Files Files Community

Wav2Txt / app.py

Merlintxu

Update app.py

86a050b verified 11 months ago

raw

history blame

6.76 kB

	import gradio as gr
	from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
	import torch
	import librosa
	import subprocess
	from langdetect import detect_langs
	import os
	import warnings
	from transformers import logging
	import math
	import json

	# Suppress warnings: ignore every Python warning category process-wide, and
	# raise the transformers logger threshold to errors only. Note that `logging`
	# here is `transformers.logging` (see the imports), not the stdlib module.
	warnings.filterwarnings("ignore")
	logging.set_verbosity_error()

	# Candidate Hugging Face model IDs keyed by two-letter language code.
	# Order matters: combined_interface() transcribes with the first entry of the
	# list, and detect_and_select_model() falls back to the "en" list for any
	# language that has no entry here.
	MODELS = {
	    "es": [
	        "openai/whisper-large-v3",
	        "facebook/wav2vec2-large-xlsr-53-spanish",
	        "jonatasgrosman/wav2vec2-xls-r-1b-spanish",
	    ],
	    "en": [
	        "openai/whisper-large-v3",
	        "facebook/wav2vec2-large-960h",
	        # The wav2vec2 960h checkpoints are published under the "facebook"
	        # namespace; the former "microsoft/wav2vec2-base-960h" ID does not
	        # resolve on the Hub.
	        "facebook/wav2vec2-base-960h",
	    ],
	    "pt": [
	        "facebook/wav2vec2-large-xlsr-53-portuguese",
	        "openai/whisper-medium",
	        "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese",
	    ],
	}

	def convert_audio_to_wav(audio_path):
	    """Convert an audio file to a mono, 16 kHz WAV file using ffmpeg.

	    The output is always written to ``converted_audio.wav`` in the current
	    working directory, replacing any file left there by an earlier call.

	    Args:
	        audio_path: Path of the audio file to convert.

	    Returns:
	        The path of the converted WAV file (``"converted_audio.wav"``).

	    Raises:
	        ValueError: If no path is given or the path is a directory.
	        RuntimeError: If ffmpeg exits with a non-zero status.
	        FileNotFoundError: If the ``ffmpeg`` executable is not installed.
	    """
	    if not audio_path:
	        raise ValueError("No audio file was provided.")
	    if os.path.isdir(audio_path):
	        raise ValueError(f"The path provided is a directory: {audio_path}")

	    wav_path = "converted_audio.wav"
	    # "-y" overwrites the previous output. Without it ffmpeg refuses to
	    # replace the existing file and a stale conversion would be returned.
	    command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
	    result = subprocess.run(
	        command,
	        stdin=subprocess.DEVNULL,  # never block waiting for interactive input
	        stdout=subprocess.PIPE,
	        stderr=subprocess.PIPE,
	    )
	    if result.returncode != 0:
	        # ffmpeg prints its diagnostics on stderr; the last line is normally
	        # the actual failure reason.
	        stderr_text = result.stderr.decode("utf-8", errors="replace").strip()
	        reason = stderr_text.splitlines()[-1] if stderr_text else "no error output"
	        raise RuntimeError(
	            f"ffmpeg failed to convert {audio_path!r} "
	            f"(exit code {result.returncode}): {reason}"
	        )
	    return wav_path

	def _pick_language(candidates):
	    """Pick a language code from langdetect candidates, preferring "es" on es/pt near-ties."""
	    best = max(candidates, key=lambda candidate: candidate.prob)
	    es_confidence = next((c.prob for c in candidates if c.lang == "es"), 0)
	    pt_confidence = next((c.prob for c in candidates if c.lang == "pt"), 0)

	    # The tie-break only applies when both languages were actually proposed
	    # and one of them is the top candidate. Otherwise two absent languages
	    # (0 vs 0) would count as a "tie" and every other language would be
	    # reported as Spanish.
	    near_tie = (
	        es_confidence > 0
	        and pt_confidence > 0
	        and abs(es_confidence - pt_confidence) < 0.2
	    )
	    if near_tie and best.lang in ("es", "pt"):
	        return "es"
	    return best.lang


	def detect_language(audio_path):
	    """Detect the spoken language of an audio file.

	    Loads up to the first 30 seconds of the file at 16 kHz, transcribes it
	    with ``openai/whisper-base`` and runs langdetect on the resulting text.
	    When Spanish and Portuguese score within 0.2 of each other, Spanish is
	    returned.

	    Args:
	        audio_path: Path of the audio file to analyse.

	    Returns:
	        The language code of the most probable language, as reported by
	        langdetect (for example ``"es"``, ``"en"`` or ``"pt"``).

	    Raises:
	        ValueError: If the audio cannot be loaded, or if the transcription
	            contains no text that langdetect can classify.
	    """
	    # Imported locally so the block stays self-contained; langdetect is
	    # already a dependency of this file.
	    from langdetect.lang_detect_exception import LangDetectException

	    try:
	        speech, _ = librosa.load(audio_path, sr=16000, duration=30)
	    except Exception as e:
	        raise ValueError(f"Error loading audio file with librosa: {e}") from e

	    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
	    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

	    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
	    predicted_ids = model.generate(input_features)
	    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

	    try:
	        langs = detect_langs(transcription)
	    except LangDetectException as e:
	        # langdetect raises when the text is empty or has no usable features,
	        # which is what a silent or non-speech recording produces.
	        raise ValueError(
	            "Could not detect the language: no recognizable speech was found "
	            "in the first 30 seconds of the audio."
	        ) from e

	    return _pick_language(langs)

	def _chunk_bounds(duration, chunk_duration, min_seconds=0.1):
	    """Return (start, end) second offsets that cover `duration` in `chunk_duration` steps."""
	    bounds = []
	    # ceil() rather than int(): truncating dropped the trailing fraction of a
	    # second, and clips shorter than one second produced no chunk at all.
	    for start in range(0, math.ceil(duration), chunk_duration):
	        end = min(start + chunk_duration, duration)
	        # NOTE(review): fragments under `min_seconds` are skipped on the
	        # assumption that they cannot hold speech and that some models reject
	        # very short inputs; confirm the threshold against the models in use.
	        if end - start >= min_seconds:
	            bounds.append((start, end))
	    return bounds


	def _load_chunk_transcriber(model_name):
	    """Load `model_name` and return (function mapping an audio chunk to text, chunk seconds)."""
	    if "whisper" in model_name:
	        processor = WhisperProcessor.from_pretrained(model_name)
	        model = WhisperForConditionalGeneration.from_pretrained(model_name)

	        def transcribe_chunk(chunk):
	            features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
	            predicted_ids = model.generate(features)
	            return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

	        return transcribe_chunk, 30

	    transcriber = pipeline("automatic-speech-recognition", model=model_name)

	    def transcribe_chunk(chunk):
	        return transcriber(chunk)["text"]

	    return transcribe_chunk, 10


	def transcribe_audio_stream(audio, model_name):
	    """Transcribe an audio file chunk by chunk, yielding progress as it goes.

	    The audio is converted to WAV, loaded at 16 kHz and split into
	    consecutive chunks: 30 seconds for models whose name contains
	    ``"whisper"``, 10 seconds for every other model (run through the
	    ``automatic-speech-recognition`` pipeline).

	    Args:
	        audio: Path of the audio file to transcribe.
	        model_name: Hugging Face model ID used for transcription.

	    Yields:
	        ``(transcriptions, progress)`` after each chunk. ``transcriptions``
	        is the cumulative list of ``{"start_time", "end_time", "text"}``
	        dicts (the same list object every time, so copy it if a snapshot is
	        needed). ``progress`` is the percentage of the audio processed so
	        far, capped at 100. Nothing is yielded for empty audio.
	    """
	    wav_audio = convert_audio_to_wav(audio)
	    speech, rate = librosa.load(wav_audio, sr=16000)
	    duration = len(speech) / rate

	    transcribe_chunk, chunk_duration = _load_chunk_transcriber(model_name)

	    transcriptions = []
	    for start, end in _chunk_bounds(duration, chunk_duration):
	        chunk = speech[int(start * rate):int(end * rate)]
	        transcriptions.append({
	            "start_time": start,
	            "end_time": end,
	            "text": transcribe_chunk(chunk),
	        })
	        # duration is > 0 here: a zero-length recording yields no bounds.
	        progress = min(100, (end / duration) * 100)
	        yield transcriptions, progress

	def detect_and_select_model(audio):
	    """Detect the language of `audio` and look up the models offered for it.

	    The file is first converted to WAV and then passed to language detection.

	    Args:
	        audio: Path of the audio file to inspect.

	    Returns:
	        A ``(language, model_options)`` tuple. ``model_options`` is the
	        ``MODELS`` entry for the detected language, or the English list when
	        the language has no entry.
	    """
	    language = detect_language(convert_audio_to_wav(audio))
	    english_models = MODELS["en"]
	    try:
	        candidates = MODELS[language]
	    except KeyError:
	        # No dedicated models for this language: offer the English ones.
	        candidates = english_models
	    return language, candidates

	def save_transcription(transcriptions, file_format):
	    """Write the transcription to a file in the current working directory.

	    Args:
	        transcriptions: List of dicts with ``start_time``, ``end_time`` and
	            ``text`` keys.
	        file_format: ``"JSON"`` writes the list as indented JSON to
	            ``transcription.json``; ``"TXT"`` writes one
	            ``start,end,text`` line per entry to ``transcription.txt``.

	    Returns:
	        The path of the file that was written.

	    Raises:
	        ValueError: If `file_format` is neither ``"JSON"`` nor ``"TXT"``.
	    """
	    # Files are written as UTF-8 explicitly: the JSON is dumped with
	    # ensure_ascii=False, so relying on the platform default encoding could
	    # fail or corrupt accented characters.
	    if file_format == "JSON":
	        file_path = "transcription.json"
	        with open(file_path, "w", encoding="utf-8") as f:
	            json.dump(transcriptions, f, ensure_ascii=False, indent=4)
	    elif file_format == "TXT":
	        file_path = "transcription.txt"
	        with open(file_path, "w", encoding="utf-8") as f:
	            f.writelines(
	                f"{entry['start_time']},{entry['end_time']},{entry['text']}\n"
	                for entry in transcriptions
	            )
	    else:
	        # Previously this fell through to an UnboundLocalError on file_path.
	        raise ValueError(
	            f"Unsupported file format: {file_format!r} (expected 'JSON' or 'TXT')"
	        )
	    return file_path

	def combined_interface(audio, file_format):
	    """Run language detection and streaming transcription for the Gradio UI.

	    Args:
	        audio: Path of the uploaded audio file, or None when nothing has been
	            uploaded yet.
	        file_format: ``"JSON"`` or ``"TXT"``; ``"JSON"`` is used when no
	            format has been chosen.

	    Yields:
	        Seven values per update, one for each interface output, in order:
	        detected language, model dropdown update, selected model, transcription
	        so far, progress (0-100), status message, and the path of the saved
	        transcription file (None until the run has finished). On failure the
	        error text is placed in the language field and the status is
	        ``"Error"``.
	    """
	    if not audio:
	        # Nothing uploaded yet: report an idle state instead of an error.
	        yield "", gr.update(choices=[], value=None), "", "", 0, "Waiting for an audio file...", None
	        return

	    # The format selector has no default value, so None means "not chosen".
	    file_format = file_format or "JSON"

	    try:
	        language, model_options = detect_and_select_model(audio)
	        selected_model = model_options[0]
	        # Update the dropdown's choices; the list itself is not a valid value
	        # for a single-select dropdown.
	        model_dropdown = gr.update(choices=model_options, value=selected_model)

	        # Every yield carries all seven outputs; the file slot stays None
	        # until the transcription has been saved.
	        yield language, model_dropdown, selected_model, "", 0, "Initializing...", None

	        transcriptions = []
	        full_transcription = ""  # stays empty if the audio produced no chunks
	        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
	            transcriptions = partial_transcriptions
	            full_transcription = " ".join(t["text"] for t in transcriptions).strip()
	            progress_int = math.floor(progress)
	            status = f"Transcribing... {progress_int}% complete"
	            yield language, model_dropdown, selected_model, full_transcription, progress_int, status, None

	        # Save transcription file
	        file_path = save_transcription(transcriptions, file_format)

	        yield (
	            language,
	            model_dropdown,
	            selected_model,
	            full_transcription,
	            100,
	            f"Transcription complete! Download {file_path}",
	            file_path,
	        )

	    except Exception as e:
	        # Top-level UI boundary: surface the failure in the interface rather
	        # than letting the request crash.
	        yield str(e), gr.update(choices=[], value=None), "", "An error occurred during processing.", 0, "Error", None

	    finally:
	        # Clean up the temporary WAV on success and on failure alike; it may
	        # not exist if the conversion itself failed.
	        try:
	            os.remove("converted_audio.wav")
	        except FileNotFoundError:
	            pass

	# Gradio front end for combined_interface. The two inputs are passed to the
	# function positionally (audio file path, output format), and each value the
	# function yields is routed to the output component at the same position.
	iface = gr.Interface(
	    fn=combined_interface,
	    title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
	    description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
	    # live=True: the function is re-run automatically when an input changes.
	    live=True,
	    inputs=[
	        gr.Audio(type="filepath"),
	        gr.Radio(choices=["JSON", "TXT"], label="Choose output format"),
	    ],
	    outputs=[
	        gr.Textbox(label="Detected Language"),
	        gr.Dropdown(label="Available Models", choices=[]),
	        gr.Textbox(label="Selected Model"),
	        gr.Textbox(label="Transcription", lines=10),
	        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
	        gr.Textbox(label="Status"),
	        gr.File(label="Download Transcription"),
	    ],
	)

	# Script entry point: enable request queuing on the interface, then start
	# the web server. Nothing is launched when the module is merely imported.
	if __name__ == "__main__":
	    queued_interface = iface.queue()
	    queued_interface.launch()