import gradio as gr
import spaces
import torch
from transformers import pipeline

from audio_processing import process_audio

# Check if CUDA is available
cuda_available = torch.cuda.is_available()

# Initialize the summarization and question-answering pipelines.
# transformers Pipeline objects have no .to() method, so the device is
# selected at construction time instead of moving the models afterwards.
device = 0 if cuda_available else -1
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=device)


@spaces.GPU
def transcribe_audio(audio_file, translate, model_size):
    """Transcribe (and optionally translate) an audio file, returning a
    human-readable report plus the concatenated plain text."""
    language_segments, final_segments = process_audio(
        audio_file, translate=translate, model_size=model_size
    )

    output = "Detected language changes:\n\n"
    for segment in language_segments:
        output += f"Language: {segment['language']}\n"
        output += f"Time: {segment['start']:.2f}s - {segment['end']:.2f}s\n\n"

    output += f"Transcription with language detection and speaker diarization (using {model_size} model):\n\n"
    full_text = ""
    for segment in final_segments:
        output += f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}) {segment['speaker']}:\n"
        output += f"Original: {segment['text']}\n"
        if translate:
            output += f"Translated: {segment['translated']}\n"
            full_text += segment['translated'] + " "
        else:
            full_text += segment['text'] + " "
        output += "\n"

    return output, full_text


@spaces.GPU
def summarize_text(text):
    # bart-large-cnn accepts at most 1024 input tokens; truncate longer
    # transcripts instead of failing on them.
    summary = summarizer(text, max_length=150, min_length=50, do_sample=False, truncation=True)[0]['summary_text']
    return summary


@spaces.GPU
def answer_question(context, question):
    result = qa_model(question=question, context=context)
    return result['answer']


@spaces.GPU
def process_and_summarize(audio_file, translate, model_size):
    transcription, full_text = transcribe_audio(audio_file, translate, model_size)
    summary = summarize_text(full_text)
    return transcription, summary


@spaces.GPU
def qa_interface(audio_file, translate, model_size, question):
    _, full_text = transcribe_audio(audio_file, translate, model_size)
    answer = answer_question(full_text, question)
    return answer


WHISPER_MODEL_SIZES = ["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]

# Main interface
with gr.Blocks() as iface:
    gr.Markdown("# WhisperX Audio Transcription, Translation, Summarization, and QA (with ZeroGPU support)")

    with gr.Tab("Transcribe and Summarize"):
        audio_input = gr.Audio(type="filepath")
        translate_checkbox = gr.Checkbox(label="Enable Translation")
        model_dropdown = gr.Dropdown(choices=WHISPER_MODEL_SIZES, label="Whisper Model Size", value="small")
        transcribe_button = gr.Button("Transcribe and Summarize")
        transcription_output = gr.Textbox(label="Transcription")
        summary_output = gr.Textbox(label="Summary")

        transcribe_button.click(
            process_and_summarize,
            inputs=[audio_input, translate_checkbox, model_dropdown],
            outputs=[transcription_output, summary_output]
        )

    with gr.Tab("Question Answering"):
        qa_audio_input = gr.Audio(type="filepath")
        qa_translate_checkbox = gr.Checkbox(label="Enable Translation")
        qa_model_dropdown = gr.Dropdown(choices=WHISPER_MODEL_SIZES, label="Whisper Model Size", value="small")
        question_input = gr.Textbox(label="Ask a question about the audio")
        qa_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer")

        qa_button.click(
            qa_interface,
            inputs=[qa_audio_input, qa_translate_checkbox, qa_model_dropdown, question_input],
            outputs=answer_output
        )

    gr.Markdown(
        """
        ## ZeroGPU Support
        This application supports ZeroGPU for Hugging Face Spaces Pro users.
        GPU-intensive tasks are automatically dispatched to a GPU for better performance.
        """
    )

iface.launch()
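
# Local usage sketch (assumptions: this file is saved as app.py, and an
# audio_processing.py module providing process_audio sits alongside it):
#   pip install gradio spaces torch transformers
#   python app.py
# On a Hugging Face Space the @spaces.GPU-decorated functions are instead
# dispatched to an on-demand ZeroGPU device.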