Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
import torch

# --- Model setup (runs once at import time) ---------------------------------
# Load Whisper model for transcription. "large" is the most accurate variant
# but also the heaviest; smaller checkpoints trade accuracy for startup time
# and memory.
whisper_model_name = "openai/whisper-large"  # You can use base, small, medium, or large based on your needs
processor = WhisperProcessor.from_pretrained(whisper_model_name)
model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)

# Initialize the language detection model: an NLI model repurposed via
# zero-shot classification — the transcription is scored against a fixed
# list of candidate language-code labels (see detect_language below).
# NOTE(review): bart-large-mnli is an English NLI model; scoring bare codes
# like "fr"/"zh" as hypotheses is a heuristic — confirm accuracy is adequate.
lang_detect_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
|
13 |
+
|
14 |
+
def transcribe_audio(audio_file):
    """Transcribe an audio file to text with the Whisper model.

    The Gradio audio component hands this function a *file path*
    (``type="filepath"``), but ``WhisperProcessor`` expects a 1-D float
    waveform sampled at 16 kHz — the original code passed the raw path
    string straight to the processor, which fails at runtime. This
    version decodes the file, downmixes to mono, resamples to 16 kHz,
    and then runs Whisper.

    NOTE(review): decoding uses the stdlib ``wave`` module, so only WAV
    input is handled (Gradio records WAV by default) — confirm whether
    other formats need to be supported.

    Args:
        audio_file: Path to a WAV file to transcribe.

    Returns:
        The decoded transcription string.
    """
    import wave

    import numpy as np

    with wave.open(audio_file, "rb") as wf:
        sample_rate = wf.getframerate()
        n_channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        frames = wf.readframes(wf.getnframes())

    # Convert interleaved PCM bytes to float32 in [-1, 1].
    if sample_width == 1:  # 8-bit WAV is unsigned
        audio = (np.frombuffer(frames, dtype=np.uint8).astype(np.float32) - 128.0) / 128.0
    elif sample_width == 2:
        audio = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
    elif sample_width == 4:
        audio = np.frombuffer(frames, dtype=np.int32).astype(np.float32) / 2147483648.0
    else:
        raise ValueError(f"Unsupported WAV sample width: {sample_width}")

    # Downmix multi-channel audio to mono.
    if n_channels > 1:
        audio = audio.reshape(-1, n_channels).mean(axis=1)

    # Whisper was trained on 16 kHz audio; resample by linear interpolation.
    target_rate = 16000
    if sample_rate != target_rate:
        n_out = int(round(len(audio) * target_rate / sample_rate))
        audio = np.interp(
            np.linspace(0.0, len(audio), num=n_out, endpoint=False),
            np.arange(len(audio)),
            audio,
        )

    # Prepare input features and generate the transcription.
    input_features = processor(audio, sampling_rate=target_rate, return_tensors="pt")
    generated_ids = model.generate(input_features["input_features"])
    transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
    return transcription
|
22 |
+
|
23 |
+
def detect_language(text):
    """Identify the most likely language of *text*.

    Runs the module-level zero-shot classifier against a fixed set of
    language-code labels and returns the best match.

    Args:
        text: The transcription to classify.

    Returns:
        A ``(language_code, confidence_score)`` tuple for the top label.
    """
    candidates = ["en", "fr", "es", "de", "it", "pt", "zh", "ja", "ar", "hi"]
    prediction = lang_detect_model(text, candidate_labels=candidates)
    # The pipeline returns labels/scores sorted best-first; take the top one.
    top_label = prediction["labels"][0]
    top_score = prediction["scores"][0]
    return top_label, top_score
|
27 |
+
|
28 |
+
def cleanup_text(text):
    """Normalize a raw transcription for display.

    Removes common filler words, collapses whitespace, tidies punctuation
    artifacts left behind by the filler removal, and upper-cases the first
    letter without altering the rest of the string.

    Args:
        text: Raw transcription text (may be empty).

    Returns:
        The cleaned-up string.
    """
    # Remove filler words like "uh", "um", etc., wherever they appear.
    text = re.sub(r'\b(uh|um|like|you know|so|actually|basically)\b', '', text, flags=re.IGNORECASE)
    # Collapse the runs of whitespace left behind by the removals.
    text = re.sub(r'\s+', ' ', text)
    # Removing a filler can strand a space before punctuation ("hello , there")
    # or leave punctuation at the very start (", hello"); tidy both cases.
    text = re.sub(r'\s+([,.!?;:])', r'\1', text)
    text = re.sub(r'^[\s,.!?;:]+', '', text)
    # Strip leading and trailing spaces.
    text = text.strip()
    # BUGFIX: str.capitalize() also lower-cases the remainder of the string,
    # destroying proper nouns and acronyms ("Paris" -> "paris"). Upper-case
    # only the first character instead; safe on the empty string.
    return text[:1].upper() + text[1:]
|
39 |
+
|
40 |
+
def process_audio(audio_file):
    """Run the full pipeline on one recording.

    Transcribes the audio, detects the language of the raw transcription,
    and cleans the transcription up for display.

    Args:
        audio_file: Path to the recorded audio file.

    Returns:
        A ``(cleaned_text, language, score)`` tuple: the tidied
        transcription, the detected language label, and its confidence.
    """
    raw_text = transcribe_audio(audio_file)
    # Language detection runs on the raw transcription, before cleanup.
    detected_lang, confidence = detect_language(raw_text)
    tidy_text = cleanup_text(raw_text)
    return tidy_text, detected_lang, confidence
|
46 |
+
|
47 |
+
# Gradio interface: one column holding an audio recorder, three read-only
# output fields, and a button that runs the full pipeline.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # The recorded/uploaded audio reaches the pipeline as a file path.
            audio_input = gr.Audio(label="Record your voice", type="filepath", scale=1)
            output_text = gr.Textbox(label="Transcription", scale=1)
            output_lang = gr.Textbox(label="Detected Language", scale=1)
            output_score = gr.Textbox(label="Confidence Score", scale=1)
            process_btn = gr.Button("Process Audio")

    # Wire the button: one audio input, three text outputs (in order).
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[output_text, output_lang, output_score],
    )

demo.launch(debug=True)
|