import gradio as gr
from audio_processing import process_audio, print_results
from transformers import pipeline
import spaces
import torch

# Check if CUDA is available and pick the pipeline device accordingly
cuda_available = torch.cuda.is_available()
device = 0 if cuda_available else -1

# Initialize the summarization and question-answering pipelines on the chosen device
# (transformers pipelines take a `device` argument; they do not expose a .to() method)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=device)

def transcribe_audio(audio_file, translate, model_size):
    """Transcribe with language detection and speaker diarization; return (formatted report, plain text)."""
    language_segments, final_segments = process_audio(audio_file, translate=translate, model_size=model_size)

    output = "Detected language changes:\n\n"
    for segment in language_segments:
        output += f"Language: {segment['language']}\n"
        output += f"Time: {segment['start']:.2f}s - {segment['end']:.2f}s\n\n"

    output += f"Transcription with language detection and speaker diarization (using {model_size} model):\n\n"
    full_text = ""
    for segment in final_segments:
        output += f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}) {segment['speaker']}:\n"
        output += f"Original: {segment['text']}\n"
        if translate:
            output += f"Translated: {segment['translated']}\n"
            full_text += segment['translated'] + " "
        else:
            full_text += segment['text'] + " "
        output += "\n"

    return output, full_text

def summarize_text(text):
    summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
    return summary

def answer_question(context, question):
    result = qa_model(question=question, context=context)
    return result['answer']
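
# The app imports `spaces` and advertises ZeroGPU support but never requests a GPU.
# A minimal sketch, assuming the standard Hugging Face `spaces` package: decorating the
# Gradio entry points with @spaces.GPU asks ZeroGPU to allocate a GPU while they run.
@spaces.GPU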
def process_and_summarize(audio_file, translate, model_size):
    transcription, full_text = transcribe_audio(audio_file, translate, model_size)
    summary = summarize_text(full_text)
    return transcription, summary
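
# Same assumption as above: request a ZeroGPU slot for the question-answering path.
@spaces.GPU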
def qa_interface(audio_file, translate, model_size, question):
    _, full_text = transcribe_audio(audio_file, translate, model_size)
    answer = answer_question(full_text, question)
    return answer

# Main interface
with gr.Blocks() as iface:
    gr.Markdown("# WhisperX Audio Transcription, Translation, Summarization, and QA (with ZeroGPU support)")

    with gr.Tab("Transcribe and Summarize"):
        audio_input = gr.Audio(type="filepath")
        translate_checkbox = gr.Checkbox(label="Enable Translation")
        model_dropdown = gr.Dropdown(choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"], label="Whisper Model Size", value="small")
        transcribe_button = gr.Button("Transcribe and Summarize")
        transcription_output = gr.Textbox(label="Transcription")
        summary_output = gr.Textbox(label="Summary")
        transcribe_button.click(
            process_and_summarize,
            inputs=[audio_input, translate_checkbox, model_dropdown],
            outputs=[transcription_output, summary_output]
        )

    with gr.Tab("Question Answering"):
        qa_audio_input = gr.Audio(type="filepath")
        qa_translate_checkbox = gr.Checkbox(label="Enable Translation")
        qa_model_dropdown = gr.Dropdown(choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"], label="Whisper Model Size", value="small")
        question_input = gr.Textbox(label="Ask a question about the audio")
        qa_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer")
        qa_button.click(
            qa_interface,
            inputs=[qa_audio_input, qa_translate_checkbox, qa_model_dropdown, question_input],
            outputs=answer_output
        )
    gr.Markdown(
        """
        ## ZeroGPU Support
        This application supports ZeroGPU for Hugging Face Spaces Pro users.
        GPU-intensive tasks (transcription, summarization, and question answering) run on an on-demand GPU when one is available.
        """
    )

iface.launch()