ajchri5 committed on
Commit
e4528d2
·
verified ·
1 Parent(s): 1ee60bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -8
app.py CHANGED
@@ -2,22 +2,29 @@ import re
2
  import gradio as gr
3
  from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
4
  import torch
 
 
5
 
6
  # Load Whisper model for transcription
7
- whisper_model_name = "openai/whisper-large" # You can use base, small, medium, or large based on your needs
8
  processor = WhisperProcessor.from_pretrained(whisper_model_name)
9
  model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
10
 
11
- # Initialize the language detection model (using zero-shot classification for language detection)
12
  lang_detect_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
13
 
14
  # Function to transcribe audio to text using Whisper model
15
  def transcribe_audio(audio_file):
 
 
 
16
  # Prepare input features for Whisper
17
- input_features = processor(audio_file, return_tensors="pt")
 
18
  # Generate transcription
19
  generated_ids = model.generate(input_features["input_features"])
20
  transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
 
21
  return transcription
22
 
23
  # Function to detect the language of the transcription using zero-shot classification
@@ -39,16 +46,25 @@ def cleanup_text(text):
39
 
40
  # Main function to process the audio and detect language
41
  def process_audio(audio_file):
42
- transcription = transcribe_audio(audio_file) # Transcribe audio to text
43
- lang, score = detect_language(transcription) # Detect the language of the transcription
44
- cleaned_text = cleanup_text(transcription) # Clean up the transcription
45
- return cleaned_text, lang, score # Return cleaned transcription, language, and confidence score
 
 
 
 
 
 
 
 
 
46
 
47
  # Gradio interface
48
  with gr.Blocks() as demo:
49
  with gr.Row():
50
  with gr.Column():
51
- audio_input = gr.Audio(label="Record your voice", type="filepath", scale=1) # Input for audio file
52
  output_text = gr.Textbox(label="Transcription", scale=1) # Output text for transcription
53
  output_lang = gr.Textbox(label="Detected Language", scale=1) # Output text for detected language
54
  output_score = gr.Textbox(label="Confidence Score", scale=1) # Output confidence score
 
2
  import gradio as gr
3
  from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
4
  import torch
5
+ import numpy as np
6
+ import librosa
7
 
8
  # Load Whisper model for transcription
9
+ whisper_model_name = "openai/whisper-large"
10
  processor = WhisperProcessor.from_pretrained(whisper_model_name)
11
  model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
12
 
13
+ # Initialize the language detection model
14
  lang_detect_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
15
 
16
  # Function to transcribe audio to text using Whisper model
17
  def transcribe_audio(audio_file):
18
+ # Ensure the audio is a numpy array (Gradio input type for audio is numpy)
19
+ audio = np.array(audio_file)
20
+
21
  # Prepare input features for Whisper
22
+ input_features = processor(audio, return_tensors="pt", sampling_rate=16000)
23
+
24
  # Generate transcription
25
  generated_ids = model.generate(input_features["input_features"])
26
  transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
27
+
28
  return transcription
29
 
30
  # Function to detect the language of the transcription using zero-shot classification
 
46
 
47
  # Main function to process the audio and detect language
48
  def process_audio(audio_file):
49
+ try:
50
+ transcription = transcribe_audio(audio_file) # Transcribe audio to text
51
+ if not transcription.strip(): # If transcription is empty or just whitespace
52
+ raise ValueError("Transcription is empty.")
53
+
54
+ lang, score = detect_language(transcription) # Detect the language of the transcription
55
+ cleaned_text = cleanup_text(transcription) # Clean up the transcription
56
+
57
+ return cleaned_text, lang, score # Return cleaned transcription, language, and confidence score
58
+
59
+ except Exception as e:
60
+ # If any error occurs, return the error message
61
+ return f"Error: {str(e)}", "", ""
62
 
63
  # Gradio interface
64
  with gr.Blocks() as demo:
65
  with gr.Row():
66
  with gr.Column():
67
+ audio_input = gr.Audio(label="Record your voice", type="numpy", scale=1) # Input for live audio (microphone)
68
  output_text = gr.Textbox(label="Transcription", scale=1) # Output text for transcription
69
  output_lang = gr.Textbox(label="Detected Language", scale=1) # Output text for detected language
70
  output_score = gr.Textbox(label="Confidence Score", scale=1) # Output confidence score