ajchri5 committed on
Commit
42fff29
·
verified ·
1 Parent(s): e8bafca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -0
app.py CHANGED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import gradio as gr
3
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
4
+ import torch
5
+
6
# Whisper checkpoint used for speech-to-text. Other checkpoint sizes
# (base, small, medium, large) can be substituted depending on your needs.
whisper_model_name = "openai/whisper-large"
processor = WhisperProcessor.from_pretrained(whisper_model_name)
model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)

# Zero-shot classifier that is reused to guess the language of a transcription.
lang_detect_model = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)
13
+
14
+ # Function to transcribe audio to text using Whisper model
15
def transcribe_audio(audio_file):
    """Transcribe an audio file to text with the module-level Whisper model.

    Args:
        audio_file: Path to an audio file (the Gradio input in this file is
            configured with ``type="filepath"``), or ``None``/empty when
            nothing was recorded or uploaded.

    Returns:
        The decoded transcription with special tokens removed, or an empty
        string when no audio file was supplied.
    """
    if not audio_file:
        return ""

    # Local import: the helper ships with the ``transformers`` package the
    # file already depends on. It shells out to the ``ffmpeg`` binary.
    from transformers.pipelines.audio_utils import ffmpeg_read

    # The Whisper feature extractor consumes a raw waveform at its own
    # sampling rate, not a file path, so decode and resample the file first.
    sampling_rate = processor.feature_extractor.sampling_rate
    with open(audio_file, "rb") as audio_handle:
        waveform = ffmpeg_read(audio_handle.read(), sampling_rate)

    inputs = processor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt",
    )
    generated_ids = model.generate(inputs["input_features"])
    return processor.decode(generated_ids[0], skip_special_tokens=True)
22
+
23
+ # Function to detect the language of the transcription using zero-shot classification
24
def detect_language(text, candidate_labels=None):
    """Guess the language of ``text`` with the zero-shot classifier.

    Args:
        text: The transcription to classify.
        candidate_labels: Optional iterable of labels to score. Defaults to
            the ten language codes this app has always used.

    Returns:
        A ``(label, score)`` tuple holding the top-ranked label and its
        score, or ``("unknown", 0.0)`` when ``text`` is empty or contains
        only whitespace.
    """
    # The zero-shot pipeline rejects empty sequences, so short-circuit
    # instead of letting the request crash.
    if not text or not text.strip():
        return "unknown", 0.0

    if candidate_labels is None:
        candidate_labels = ("en", "fr", "es", "de", "it", "pt", "zh", "ja", "ar", "hi")

    result = lang_detect_model(text, candidate_labels=list(candidate_labels))
    # Labels and scores come back sorted by descending score.
    return result["labels"][0], result["scores"][0]
27
+
28
+ # Cleanup function to remove filler words and clean the transcription
29
def cleanup_text(text):
    """Remove filler words from a transcription and tidy its spacing.

    Args:
        text: The raw transcription.

    Returns:
        The text with filler words removed (together with a comma that
        directly follows them), whitespace runs collapsed to single spaces,
        surrounding whitespace stripped, and the first character upper-cased.
        The casing of every other character is preserved.
    """
    # Drop the filler and an immediately following comma, so that
    # "Um, hello" does not become ", hello".
    text = re.sub(
        r'\b(?:uh|um|like|you know|so|actually|basically)\b,?',
        '',
        text,
        flags=re.IGNORECASE,
    )
    # Collapse the gaps left behind by removed words.
    text = re.sub(r'\s+', ' ', text).strip()
    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest and mangle proper nouns and acronyms.
    return text[:1].upper() + text[1:]
39
+
40
+ # Main function to process the audio and detect language
41
def process_audio(audio_file):
    """Transcribe an audio file, detect its language, and clean the text.

    Args:
        audio_file: Path to the recorded or uploaded audio, or ``None`` when
            the button is clicked without any audio.

    Returns:
        A ``(cleaned_text, language, score)`` tuple. When there is no audio,
        or the transcription is empty, ``("", "unknown", 0.0)`` is returned.
    """
    # Gradio passes None when nothing was recorded; avoid crashing the app.
    if not audio_file:
        return "", "unknown", 0.0

    transcription = transcribe_audio(audio_file)
    # Nothing to classify or clean if the model produced no text.
    if not transcription or not transcription.strip():
        return "", "unknown", 0.0

    # Language detection runs on the raw transcription, before fillers are
    # stripped.
    lang, score = detect_language(transcription)
    cleaned_text = cleanup_text(transcription)
    return cleaned_text, lang, score
46
+
47
+ # Gradio interface
48
with gr.Blocks() as demo:
    # Single row holding a single column: the audio input, the three result
    # boxes, and the trigger button are stacked vertically.
    with gr.Row(), gr.Column():
        audio_input = gr.Audio(
            label="Record your voice",
            type="filepath",
            scale=1,
        )
        output_text = gr.Textbox(label="Transcription", scale=1)
        output_lang = gr.Textbox(label="Detected Language", scale=1)
        output_score = gr.Textbox(label="Confidence Score", scale=1)
        process_btn = gr.Button("Process Audio")

    # Run the full pipeline on click and fan the three results out to the
    # three text boxes.
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[output_text, output_lang, output_score],
    )

demo.launch(debug=True)