Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -2,22 +2,29 @@ import re
|
|
2 |
import gradio as gr
|
3 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
|
4 |
import torch
|
|
|
|
|
5 |
|
6 |
# Load Whisper model for transcription
|
7 |
-
whisper_model_name = "openai/whisper-large"
|
8 |
processor = WhisperProcessor.from_pretrained(whisper_model_name)
|
9 |
model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
|
10 |
|
11 |
-
# Initialize the language detection model
|
12 |
lang_detect_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
|
13 |
|
14 |
# Function to transcribe audio to text using Whisper model
|
15 |
def transcribe_audio(audio_file):
|
|
|
|
|
|
|
16 |
# Prepare input features for Whisper
|
17 |
-
input_features = processor(
|
|
|
18 |
# Generate transcription
|
19 |
generated_ids = model.generate(input_features["input_features"])
|
20 |
transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
|
|
|
21 |
return transcription
|
22 |
|
23 |
# Function to detect the language of the transcription using zero-shot classification
|
@@ -39,16 +46,25 @@ def cleanup_text(text):
|
|
39 |
|
40 |
# Main function to process the audio and detect language
|
41 |
def process_audio(audio_file):
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
# Gradio interface
|
48 |
with gr.Blocks() as demo:
|
49 |
with gr.Row():
|
50 |
with gr.Column():
|
51 |
-
audio_input = gr.Audio(label="Record your voice", type="
|
52 |
output_text = gr.Textbox(label="Transcription", scale=1) # Output text for transcription
|
53 |
output_lang = gr.Textbox(label="Detected Language", scale=1) # Output text for detected language
|
54 |
output_score = gr.Textbox(label="Confidence Score", scale=1) # Output confidence score
|
|
|
2 |
import gradio as gr
|
3 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
|
4 |
import torch
|
5 |
+
import numpy as np
|
6 |
+
import librosa
|
7 |
|
8 |
# Load the Whisper processor and model once at import time from the checkpoint named in whisper_model_name; transcribe_audio uses both as module-level globals
|
9 |
+
whisper_model_name = "openai/whisper-large"
|
10 |
processor = WhisperProcessor.from_pretrained(whisper_model_name)
|
11 |
model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
|
12 |
|
13 |
+
# Zero-shot classification pipeline (facebook/bart-large-mnli); presumably consumed by detect_language, which is outside this hunk - TODO confirm
|
14 |
lang_detect_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
|
15 |
|
16 |
# Function to transcribe audio to text using Whisper model
|
17 |
def transcribe_audio(audio_file):
    """Transcribe a Gradio audio recording to text with the Whisper model.

    Args:
        audio_file: Audio as delivered by ``gr.Audio(type="numpy")``, i.e. a
            ``(sample_rate, samples)`` tuple. A bare array of samples is
            still accepted and is assumed to already be sampled at 16 kHz.

    Returns:
        The decoded transcription string with special tokens removed.

    Raises:
        ValueError: If no audio was provided or the recording has no samples.
    """
    target_rate = 16000  # sampling rate passed to the Whisper processor

    if audio_file is None:
        raise ValueError("No audio provided.")

    # Gradio's numpy audio is a (sample_rate, samples) pair. Calling
    # np.array() on that pair does not produce a waveform, so unpack it.
    if isinstance(audio_file, tuple) and len(audio_file) == 2:
        sample_rate, samples = audio_file
    else:
        sample_rate, samples = target_rate, audio_file

    samples = np.asarray(samples)
    if samples.size == 0:
        raise ValueError("Audio recording is empty.")

    # Integer PCM (Gradio records int16) must be scaled to floats in [-1, 1].
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)

    # Multi-channel audio arrives as (samples, channels); down-mix to mono.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Resample so the rate declared to the processor matches the real data.
    if sample_rate != target_rate:
        samples = librosa.resample(
            samples, orig_sr=sample_rate, target_sr=target_rate
        )

    # Prepare input features for Whisper
    input_features = processor(
        samples, return_tensors="pt", sampling_rate=target_rate
    )

    # Generate transcription
    generated_ids = model.generate(input_features["input_features"])
    transcription = processor.decode(generated_ids[0], skip_special_tokens=True)

    return transcription
|
29 |
|
30 |
# Function to detect the language of the transcription using zero-shot classification
|
|
|
46 |
|
47 |
# Main function to process the audio and detect language
|
48 |
def process_audio(audio_file):
    """Run the whole pipeline for one recording.

    The audio is transcribed, the language of the raw transcription is
    detected, and the transcription is cleaned up for display.

    Args:
        audio_file: The audio value forwarded unchanged to
            ``transcribe_audio``.

    Returns:
        A 3-tuple ``(text, language, score)``. If any step raises, the tuple
        is ``("Error: <message>", "", "")`` instead, so the failure is
        reported through the return value rather than propagated.
    """
    try:
        raw_text = transcribe_audio(audio_file)

        # A transcription made only of whitespace is treated as a failure.
        has_content = bool(raw_text.strip())
        if not has_content:
            raise ValueError("Transcription is empty.")

        # Language detection runs on the raw text, before any cleanup.
        language, confidence = detect_language(raw_text)
        outcome = (cleanup_text(raw_text), language, confidence)
    except Exception as exc:
        # Report the failure in the first slot and leave the others blank.
        outcome = (f"Error: {str(exc)}", "", "")

    return outcome
|
62 |
|
63 |
# Gradio interface
|
64 |
with gr.Blocks() as demo:
|
65 |
with gr.Row():
|
66 |
with gr.Column():
|
67 |
+
audio_input = gr.Audio(label="Record your voice", type="numpy", scale=1) # Input for live audio (microphone)
|
68 |
output_text = gr.Textbox(label="Transcription", scale=1) # Output text for transcription
|
69 |
output_lang = gr.Textbox(label="Detected Language", scale=1) # Output text for detected language
|
70 |
output_score = gr.Textbox(label="Confidence Score", scale=1) # Output confidence score
|