""" Gradio interface module Contains all UI components and interface logic """ import gradio as gr import asyncio import os from ..tools import mcp_tools from ..tools.download_tools import get_file_info_tool, get_mp3_files_tool, read_text_file_segments_tool from ..tools.transcription_tools import transcribe_audio_file_tool def write_text_file_content(file_path: str, content: str, mode: str = "w", position: int = None): """Simple text file writing function""" try: if mode == "r+" and position is not None: with open(file_path, mode, encoding='utf-8') as f: f.seek(position) characters_written = f.write(content) else: with open(file_path, mode, encoding='utf-8') as f: characters_written = f.write(content) return { "status": "success", "characters_written": characters_written, "operation_type": mode, "size_change": len(content) } except Exception as e: return { "status": "failed", "error_message": str(e) } def temporarily_set_hf_token(hf_token: str): """Temporarily set HF_TOKEN in environment""" original_token = os.environ.get("HF_TOKEN") if hf_token and hf_token.strip(): os.environ["HF_TOKEN"] = hf_token.strip() print(f"šŸ”‘ Using user-provided HF_TOKEN: {hf_token[:10]}...") return original_token def restore_hf_token(original_token: str): """Restore original HF_TOKEN in environment""" if original_token is not None: os.environ["HF_TOKEN"] = original_token elif "HF_TOKEN" in os.environ: del os.environ["HF_TOKEN"] def get_default_directories(): """Get default directories based on current environment""" import pathlib # Detect environment is_modal = os.environ.get("MODAL_ENVIRONMENT") == "1" or os.path.exists("/modal") is_docker = os.path.exists("/.dockerenv") current_dir = pathlib.Path.cwd() # Base directories base_dirs = [] if is_modal: # Modal environment - use cache directories base_dirs.extend([ "/root/cache/apple_podcasts", "/root/cache/xyz_podcasts", "/tmp/downloads" ]) elif is_docker: # Docker environment base_dirs.extend([ "/app/downloads", "/data/downloads", "/tmp/downloads" ]) else: # Local environment - use current directory and common locations base_dirs.extend([ str(current_dir / "downloads"), str(current_dir / "cache" / "apple_podcasts"), str(current_dir / "cache" / "xyz_podcasts"), "~/Downloads", "~/Music" ]) # Add common directories base_dirs.extend(["/tmp", "."]) # Filter out duplicates while preserving order seen = set() unique_dirs = [] for d in base_dirs: if d not in seen: seen.add(d) unique_dirs.append(d) # Determine default directory default_dir = unique_dirs[0] if unique_dirs else str(current_dir / "downloads") return unique_dirs, default_dir def create_gradio_interface(): """Create Gradio interface Returns: gr.Blocks: Configured Gradio interface """ with gr.Blocks(title="ModalTranscriberMCP") as demo: gr.Markdown("# šŸŽ™ļø ModalTranscriberMCP") gr.Markdown("**Advanced Audio Transcription with Modal Cloud Computing & MCP Integration**") # Performance Highlight gr.Markdown(""" ### ⚔ **Supercharged by Modal Serverless GPU** šŸš€ **10-20x Faster Processing**: 1-hour audio transcribed in just 3-6 minutes šŸŽÆ **Parallel GPU Processing**: Up to 10 concurrent GPU containers ā˜ļø **Zero Infrastructure Management**: Fully serverless, pay-per-use """, elem_classes=["performance-highlight"]) # MCP Usage Instructions gr.Markdown(""" ### šŸ“š How to Use This MCP Server This application provides both a **Web UI** and **MCP (Model Context Protocol) Tools** for AI assistants A youtube video demo is below: [![YouTube Video](https://img.youtube.com/vi/Ut5jw7Epb0o/0.jpg)](https://youtu.be/Ut5jw7Epb0o) #### 🌐 Web Interface - Use the tabs above to download podcasts and transcribe audio files - Support for multi-platform downloads (Apple Podcasts, XiaoYuZhou) - Advanced features: speaker diarization (requires Hugging Face Token), multiple output formats - **High-speed processing**: Powered by Modal's distributed GPU infrastructure #### šŸ¤– MCP Integration **For AI Assistants (Claude, etc.):** ``` MCP Server URL: /api/mcp Available Tools: • transcribe_audio_file_tool - High-quality audio transcription • download_apple_podcast_tool - Apple Podcasts audio download • download_xyz_podcast_tool - XiaoYuZhou podcast download • get_mp3_files_tool - Scan directories for audio files • get_file_info_tool - Get file information • read_text_file_segments_tool - Read large text files in chunks ``` **Connect via MCP Client:** we deploy mcp server on modal, and we can use the url to connect to the gradio ui the url is: http://richardsucran--gradio-mcp-ui-app-entry.modal.run and json config is: ```json { "mcpServers": { "podcast-mcp": { "url": "http://richardsucran--gradio-mcp-ui-app-entry.modal.run/api/mcp" } } } ``` """, elem_classes=["mcp-instructions"]) # ==================== Podcast Download Tab ==================== with gr.Tab("Podcast Download"): gr.Markdown("### šŸŽ™ļø Download Podcast Audio") url_input = gr.Textbox( label="Podcast Link", placeholder="Enter podcast page URL", lines=1 ) platform_choice = gr.Radio( choices=["Apple Podcast", "XiaoYuZhou"], label="Select Podcast Platform", value="Apple Podcast" ) # Transcription options with gr.Row(): auto_transcribe = gr.Checkbox( label="Auto-transcribe after download", value=True, info="Start transcription immediately after download" ) enable_speaker_diarization = gr.Checkbox( label="Enable speaker diarization", value=False, info="Identify different speakers (requires Hugging Face Token)" ) # HF Token input for speaker diarization hf_token_input_download = gr.Textbox( label="Hugging Face Token (Optional)", placeholder="Enter your HF token here to override environment variable", type="password", info="Required for speaker diarization. If provided, will override HF_TOKEN environment variable." ) download_btn = gr.Button("šŸ“„ Start Download", variant="primary") result_output = gr.JSON(label="Download Results") async def download_podcast_and_transcribe(url, platform, auto_transcribe, enable_speaker, hf_token): """Call corresponding download tool based on selected platform""" # Temporarily set HF_TOKEN if provided original_token = temporarily_set_hf_token(hf_token) try: if platform == "Apple Podcast": download_result = await mcp_tools.download_apple_podcast(url) else: download_result = await mcp_tools.download_xyz_podcast(url) # 2. Check if download was successful if download_result["status"] != "success": return { "download_status": "failed", "error_message": download_result.get("error_message", "Download failed"), "transcription_status": "not_started" } # 3. If not auto-transcribing, return only download results if not auto_transcribe: return { "download_status": "success", "audio_file": download_result["audio_file_path"], "transcription_status": "skipped (user chose not to auto-transcribe)" } # 4. Start transcription try: audio_path = download_result["audio_file_path"] print(f"Transcribing audio file: {audio_path}") transcribe_result = await mcp_tools.transcribe_audio_file( audio_path, model_size="turbo", language=None, output_format="srt", enable_speaker_diarization=enable_speaker ) # 5. Merge results result = { "download_status": "success", "audio_file": audio_path, "transcription_status": "success", "txt_file_path": transcribe_result.get("txt_file_path"), "srt_file_path": transcribe_result.get("srt_file_path"), "transcription_details": { "model_used": transcribe_result.get("model_used"), "segment_count": transcribe_result.get("segment_count"), "audio_duration": transcribe_result.get("audio_duration"), "saved_files": transcribe_result.get("saved_files", []), "speaker_diarization_enabled": transcribe_result.get("speaker_diarization_enabled", False) } } # 6. Add speaker diarization info if enabled if enable_speaker and transcribe_result.get("speaker_diarization_enabled", False): result["speaker_diarization"] = { "global_speaker_count": transcribe_result.get("global_speaker_count", 0), "speaker_summary": transcribe_result.get("speaker_summary", {}) } return result except Exception as e: return { "download_status": "success", "audio_file": download_result["audio_file_path"], "transcription_status": "failed", "error_message": str(e) } finally: # Restore original HF_TOKEN restore_hf_token(original_token) # Bind callback function download_btn.click( download_podcast_and_transcribe, inputs=[url_input, platform_choice, auto_transcribe, enable_speaker_diarization, hf_token_input_download], outputs=result_output ) # ==================== Audio Transcription Tab ==================== with gr.Tab("Audio Transcription"): gr.Markdown("### šŸŽ¤ Audio Transcription and Speaker Diarization") gr.Markdown("Upload audio files for high-quality transcription with speaker diarization support") with gr.Row(): with gr.Column(scale=2): # Audio file input audio_file_input = gr.Textbox( label="Audio File Path", placeholder="Enter complete path to audio file (supports mp3, wav, m4a, etc.)", lines=1 ) # Transcription parameter settings with gr.Row(): model_size_choice = gr.Dropdown( choices=["tiny", "base", "small", "medium", "large", "turbo"], value="turbo", label="Model Size", info="Affects transcription accuracy and speed" ) language_choice = gr.Dropdown( choices=["auto", "zh", "en", "ja", "ko", "fr", "de", "es"], value="auto", label="Language", info="auto=auto-detect" ) with gr.Row(): with gr.Column(): output_format_choice = gr.Radio( choices=["srt", "txt", "json"], value="srt", label="Output Format" ) with gr.Column(): enable_speaker_separation = gr.Checkbox( label="Enable speaker diarization", value=False, info="Requires Hugging Face Token" ) # HF Token input for speaker diarization hf_token_input_transcribe = gr.Textbox( label="Hugging Face Token (Optional)", placeholder="Enter your HF token here to override environment variable", type="password", info="Required for speaker diarization. If provided, will override HF_TOKEN environment variable." ) transcribe_btn = gr.Button("šŸŽ¤ Start Transcription", variant="primary", size="lg") with gr.Column(scale=1): # Audio file information audio_info_output = gr.JSON(label="Audio File Information", visible=False) # Transcription progress and status transcribe_status = gr.Textbox( label="Transcription Status", value="Waiting to start transcription...", interactive=False, lines=3 ) # Transcription results display transcribe_result_output = gr.JSON( label="Transcription Results", visible=True ) # Speaker diarization results (if enabled) speaker_info_output = gr.JSON( label="Speaker Diarization Information", visible=False ) def perform_transcription(audio_path, model_size, language, output_format, enable_speaker, hf_token): """Execute audio transcription""" if not audio_path.strip(): return { "error": "Please enter audio file path" }, "Transcription failed: No audio file selected", gr.update(visible=False) # Temporarily set HF_TOKEN if provided original_token = temporarily_set_hf_token(hf_token) try: # Check if file exists import asyncio file_info = asyncio.run(get_file_info_tool(audio_path)) if file_info["status"] != "success": return { "error": f"File does not exist or cannot be accessed: {file_info.get('error_message', 'Unknown error')}" }, "Transcription failed: File inaccessible", gr.update(visible=False) try: # Process language parameter lang = None if language == "auto" else language # Call transcription tool result = asyncio.run(transcribe_audio_file_tool( audio_file_path=audio_path, model_size=model_size, language=lang, output_format=output_format, enable_speaker_diarization=enable_speaker )) # Prepare status information if result.get("processing_status") == "success": status_text = f"""āœ… Transcription completed! šŸ“ Generated files: {len(result.get('saved_files', []))} files šŸŽµ Audio duration: {result.get('audio_duration', 0):.2f} seconds šŸ“ Transcription segments: {result.get('segment_count', 0)} segments šŸŽÆ Model used: {result.get('model_used', 'N/A')} šŸŽ­ Speaker diarization: {'Enabled' if result.get('speaker_diarization_enabled', False) else 'Disabled'}""" # Show speaker information speaker_visible = result.get('speaker_diarization_enabled', False) and result.get('global_speaker_count', 0) > 0 speaker_info = result.get('speaker_summary', {}) if speaker_visible else {} return result, status_text, gr.update(visible=speaker_visible, value=speaker_info) else: error_msg = result.get('error_message', 'Unknown error') return result, f"āŒ Transcription failed: {error_msg}", gr.update(visible=False) except Exception as e: return { "error": f"Exception occurred during transcription: {str(e)}" }, f"āŒ Transcription exception: {str(e)}", gr.update(visible=False) finally: # Restore original HF_TOKEN restore_hf_token(original_token) # Bind transcription button transcribe_btn.click( perform_transcription, inputs=[ audio_file_input, model_size_choice, language_choice, output_format_choice, enable_speaker_separation, hf_token_input_transcribe ], outputs=[ transcribe_result_output, transcribe_status, speaker_info_output ] ) # ==================== MP3 File Management Tab ==================== with gr.Tab("MP3 File Management"): gr.Markdown("### šŸŽµ MP3 File Management") # Get environment-specific directories available_dirs, default_dir = get_default_directories() # Display environment info import pathlib is_modal = os.environ.get("MODAL_ENVIRONMENT") == "1" or os.path.exists("/modal") is_docker = os.path.exists("/.dockerenv") current_dir = pathlib.Path.cwd() if is_modal: env_info = "šŸš€ **Modal Environment Detected** - Using Modal cache directories" elif is_docker: env_info = "🐳 **Docker Environment Detected** - Using container directories" else: env_info = f"šŸ’» **Local Environment Detected** - Using current directory: `{current_dir}`" gr.Markdown(env_info) with gr.Row(): with gr.Column(scale=3): # Flexible directory path input custom_dir_input = gr.Textbox( label="Custom Directory Path", placeholder="Enter custom directory path (e.g., /path/to/your/audio/files)", lines=1, value=default_dir ) with gr.Column(scale=2): # Quick select for environment-specific directories quick_select = gr.Dropdown( label="Quick Select", choices=available_dirs, value=default_dir, info="Select directories based on current environment" ) with gr.Column(scale=1): scan_btn = gr.Button("šŸ” Scan Directory", variant="primary") file_list = gr.Textbox( label="MP3 File List", interactive=False, lines=10, max_lines=20, show_copy_button=True, autoscroll=True ) def list_mp3_files(directory): """List MP3 files in directory""" if not directory or not directory.strip(): return "Please enter a directory path" try: result = asyncio.run(get_mp3_files_tool(directory.strip())) # Check if there's an error if "error_message" in result: return f"āŒ Error scanning directory: {result['error_message']}" # Get file list total_files = result.get('total_files', 0) file_list = result.get('file_list', []) scanned_directory = result.get('scanned_directory', directory) if total_files == 0: return f"šŸ“‚ No MP3 files found in: {scanned_directory}" # Format file list for display display_lines = [ f"šŸ“‚ Found {total_files} MP3 file{'s' if total_files != 1 else ''} in: {scanned_directory}", "=" * 60 ] for i, file_info in enumerate(file_list, 1): filename = file_info.get('filename', 'Unknown') size_mb = file_info.get('file_size_mb', 0) created_time = file_info.get('created_time', 'Unknown') full_path = file_info.get('full_path', 'Unknown') display_lines.append( f"{i:2d}. šŸ“„ {filename}\n" f" šŸ’¾ Size: {size_mb:.2f} MB\n" f" šŸ“… Created: {created_time}\n" f" šŸ“ Path: {full_path}" ) return "\n".join(display_lines) except Exception as e: return f"āŒ Exception occurred while scanning directory: {str(e)}" def use_quick_select(selected_path): """Use quick select path and auto-scan""" if selected_path: return selected_path, list_mp3_files(selected_path) return "", "" def scan_directory(custom_path, quick_path): """Scan the directory based on custom input or quick select""" directory = custom_path.strip() if custom_path.strip() else quick_path return list_mp3_files(directory) # Bind callback functions quick_select.change( use_quick_select, inputs=[quick_select], outputs=[custom_dir_input, file_list] ) scan_btn.click( scan_directory, inputs=[custom_dir_input, quick_select], outputs=[file_list] ) # Auto-scan when custom directory is entered custom_dir_input.change( lambda x: list_mp3_files(x) if x.strip() else "", inputs=[custom_dir_input], outputs=[file_list] ) # ==================== Transcription Text Management Tab ==================== with gr.Tab("Transcription Text Management"): gr.Markdown("### šŸ“ Transcription Text File Management") gr.Markdown("View TXT and SRT files generated from audio transcription") # File path input file_path_input = gr.Textbox( label="File Path", placeholder="Enter path to TXT or SRT file to read", lines=1 ) # Load button load_file_btn = gr.Button("šŸ“‚ Load File", variant="primary") # Text content viewer content_editor = gr.Textbox( label="File Content", placeholder="File content will be displayed here after loading...", lines=25, max_lines=40, show_copy_button=True, interactive=False ) # Status information status_output = gr.Textbox( label="Status", interactive=False, lines=2 ) def load_and_display_file(file_path): """Load and display complete file content""" if not file_path.strip(): return "Please enter a file path", "āŒ No file path provided" try: # Get file info first info = asyncio.run(get_file_info_tool(file_path)) if info["status"] != "success": return "", f"āŒ Error: {info.get('error_message', 'Unknown error')}" # Check file size (warn for very large files) file_size_mb = info.get('file_size_mb', 0) if file_size_mb > 10: # Warn for files larger than 10MB return "", f"āš ļø File is too large ({file_size_mb:.2f} MB). Please use a smaller file for viewing." # Read entire file content with open(file_path, 'r', encoding='utf-8') as f: content = f.read() # Status message status = f"āœ… File loaded successfully: {info.get('filename', 'Unknown')}\nšŸ“ Size: {file_size_mb:.2f} MB" return content, status except UnicodeDecodeError: return "", "āŒ Error: File contains non-text content or encoding is not UTF-8" except FileNotFoundError: return "", "āŒ Error: File not found" except PermissionError: return "", "āŒ Error: Permission denied to read file" except Exception as e: return "", f"āŒ Error: {str(e)}" # Bind event handler load_file_btn.click( load_and_display_file, inputs=[file_path_input], outputs=[content_editor, status_output] ) return demo