Spaces:

ajchri5
/

Assignment-2-IT164_ajchri5

Runtime error

App Files Files Community

Assignment-2-IT164_ajchri5 / app.py

ajchri5

Update app.py

4570d8a verified 8 months ago

raw

history blame

3.69 kB

	import re
	import gradio as gr
	from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
	import torch
	import numpy as np

	# Speech-to-text: the processor and the generation model are both loaded
	# from the same Whisper checkpoint name.
	whisper_model_name = "openai/whisper-large"
	processor = WhisperProcessor.from_pretrained(whisper_model_name)
	model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)

	# Zero-shot classification pipeline used later to guess the language of a transcription.
	lang_detect_model = pipeline(
	    task="zero-shot-classification",
	    model="facebook/bart-large-mnli",
	)

	def _to_mono_float(samples):
	    """Return `samples` as a 1-D float32 waveform.

	    Integer PCM is scaled by the dtype's maximum so values land in roughly
	    [-1, 1]; multi-channel input is averaged down to a single channel.
	    """
	    data = np.asarray(samples)
	    if np.issubdtype(data.dtype, np.integer):
	        data = data.astype(np.float32) / np.iinfo(data.dtype).max
	    else:
	        data = data.astype(np.float32)
	    if data.ndim > 1:
	        # The channel axis is the short one, whether the layout is
	        # (samples, channels) or (channels, samples). Averaging keeps the
	        # duration intact, unlike flatten(), which would splice channels together.
	        data = data.mean(axis=int(np.argmin(data.shape)))
	    return data.reshape(-1)


	def _resample_linear(waveform, source_rate, target_rate):
	    """Resample a 1-D waveform to `target_rate` by linear interpolation.

	    This is a dependency-free approximation (no anti-aliasing filter).
	    """
	    if source_rate == target_rate or waveform.size == 0:
	        return waveform
	    target_len = max(1, int(round(waveform.size * target_rate / source_rate)))
	    source_times = np.arange(waveform.size) / source_rate
	    target_times = np.arange(target_len) / target_rate
	    return np.interp(target_times, source_times, waveform).astype(np.float32)


	def transcribe_audio(audio_file):
	    """Transcribe recorded audio to text with the module-level Whisper model.

	    Args:
	        audio_file: One of
	            * a ``(sample_rate, samples)`` tuple, as delivered by
	              ``gr.Audio(type="numpy")``;
	            * a list of sample arrays, which are concatenated in order;
	            * a bare sample array.
	            Lists and bare arrays carry no rate and are assumed to already
	            be sampled at 16 kHz.

	    Returns:
	        The decoded transcription string with special tokens removed.

	    Raises:
	        ValueError: If no audio is supplied, the sample rate is not
	            positive, or the clip contains no samples.
	    """
	    whisper_rate = 16000  # rate passed to the processor below

	    if audio_file is None:
	        raise ValueError("No audio provided.")

	    sample_rate = whisper_rate
	    if isinstance(audio_file, tuple) and len(audio_file) == 2:
	        # Unpack the (sample_rate, samples) pair instead of mixing the rate
	        # into the sample array.
	        sample_rate, samples = audio_file
	        sample_rate = int(sample_rate)
	        if sample_rate <= 0:
	            raise ValueError("Invalid sample rate.")
	        audio = _to_mono_float(samples)
	    elif isinstance(audio_file, list):
	        audio = np.concatenate([_to_mono_float(clip) for clip in audio_file])
	    else:
	        audio = _to_mono_float(audio_file)

	    if audio.size == 0:
	        raise ValueError("Audio clip is empty.")

	    audio = _resample_linear(audio, sample_rate, whisper_rate)

	    # Prepare input features for Whisper at the rate the audio now has
	    input_features = processor(audio, return_tensors="pt", sampling_rate=whisper_rate)

	    # Generate token ids and decode the first (only) sequence
	    generated_ids = model.generate(input_features["input_features"])
	    transcription = processor.decode(generated_ids[0], skip_special_tokens=True)

	    return transcription

	def detect_language(text):
	    """Guess the language of `text` with the module-level zero-shot classifier.

	    Returns:
	        A ``(label, score)`` pair taken from the first entries of the
	        classifier's ``labels`` and ``scores`` lists.
	    """
	    language_codes = ["en", "fr", "es", "de", "it", "pt", "zh", "ja", "ar", "hi"]
	    prediction = lang_detect_model(text, candidate_labels=language_codes)
	    top_label = prediction['labels'][0]
	    top_score = prediction['scores'][0]
	    return top_label, top_score

	def cleanup_text(text):
	    """Remove filler words from a transcription and tidy its whitespace.

	    Args:
	        text: The raw transcription string.

	    Returns:
	        The text with the filler words removed (matched case-insensitively
	        as whole words), whitespace runs collapsed to single spaces,
	        surrounding whitespace stripped, and the first character
	        upper-cased. The case of the remaining characters is preserved.
	    """
	    # Plain `|` is regex alternation; an escaped `\|` would only match a
	    # literal pipe character and never remove anything.
	    text = re.sub(
	        r'\b(?:uh|um|like|you know|so|actually|basically)\b',
	        '',
	        text,
	        flags=re.IGNORECASE,
	    )
	    # Collapse the gaps left behind by removed words, then trim the ends
	    text = re.sub(r'\s+', ' ', text).strip()
	    # Upper-case only the first character; str.capitalize() would also
	    # lower-case the rest and destroy proper nouns and acronyms.
	    return text[:1].upper() + text[1:]

	def process_audio(audio_file):
	    """Transcribe audio, detect its language, and clean the text.

	    Returns:
	        ``(cleaned_text, language, score)`` on success. If any step raises,
	        ``("Error: <message>", "", "")`` is returned instead so the failure
	        is reported through the return values rather than raised.
	    """
	    try:
	        raw_text = transcribe_audio(audio_file)
	        # A blank or whitespace-only transcription is treated as a failure
	        if not raw_text.strip():
	            raise ValueError("Transcription is empty.")

	        # Language detection runs on the raw text, before filler removal
	        detection = detect_language(raw_text)
	        lang, score = detection
	        tidy_text = cleanup_text(raw_text)

	        return tidy_text, lang, score
	    except Exception as e:
	        # Report the failure in the first output; leave the other two empty
	        return f"Error: {str(e)}", "", ""

	# Gradio interface: one audio input, three text outputs, and a button that
	# runs process_audio on the recording.
	# NOTE(review): the indentation of this section was lost in this copy, so the
	# nesting of the `with` blocks below cannot be read off the text; restore it
	# before running.
	with gr.Blocks() as demo:
	with gr.Row():
	with gr.Column():
	# NOTE(review): with type="numpy" the handler presumably receives a
	# (sample_rate, samples) tuple — confirm against the Gradio Audio docs.
	audio_input = gr.Audio(label="Record your voice", type="numpy", scale=1) # Audio input component
	output_text = gr.Textbox(label="Transcription", scale=1) # Shows the first value returned by process_audio
	output_lang = gr.Textbox(label="Detected Language", scale=1) # Shows the second value returned by process_audio
	output_score = gr.Textbox(label="Confidence Score", scale=1) # Shows the third value returned by process_audio
	process_btn = gr.Button("Process Audio") # Triggers processing when clicked

	# Wire the button: process_audio's three return values map, in order, onto the three textboxes
	process_btn.click(fn=process_audio, inputs=[audio_input], outputs=[output_text, output_lang, output_score])

	# Start the app with debug mode enabled
	demo.launch(debug=True)