ModalTranscriberMCP / src /ui /gradio_ui.py
richard-su's picture
Upload folder using huggingface_hub
e98f763 verified
"""
Gradio interface module
Contains all UI components and interface logic
"""
import gradio as gr
import asyncio
import os
from ..tools import mcp_tools
from ..tools.download_tools import get_file_info_tool, get_mp3_files_tool, read_text_file_segments_tool
from ..tools.transcription_tools import transcribe_audio_file_tool
def write_text_file_content(file_path: str, content: str, mode: str = "w", position: int = None):
"""Simple text file writing function"""
try:
if mode == "r+" and position is not None:
with open(file_path, mode, encoding='utf-8') as f:
f.seek(position)
characters_written = f.write(content)
else:
with open(file_path, mode, encoding='utf-8') as f:
characters_written = f.write(content)
return {
"status": "success",
"characters_written": characters_written,
"operation_type": mode,
"size_change": len(content)
}
except Exception as e:
return {
"status": "failed",
"error_message": str(e)
}
def temporarily_set_hf_token(hf_token: str):
"""Temporarily set HF_TOKEN in environment"""
original_token = os.environ.get("HF_TOKEN")
if hf_token and hf_token.strip():
os.environ["HF_TOKEN"] = hf_token.strip()
print(f"πŸ”‘ Using user-provided HF_TOKEN: {hf_token[:10]}...")
return original_token
def restore_hf_token(original_token: str):
"""Restore original HF_TOKEN in environment"""
if original_token is not None:
os.environ["HF_TOKEN"] = original_token
elif "HF_TOKEN" in os.environ:
del os.environ["HF_TOKEN"]
def get_default_directories():
"""Get default directories based on current environment"""
import pathlib
# Detect environment
is_modal = os.environ.get("MODAL_ENVIRONMENT") == "1" or os.path.exists("/modal")
is_docker = os.path.exists("/.dockerenv")
current_dir = pathlib.Path.cwd()
# Base directories
base_dirs = []
if is_modal:
# Modal environment - use cache directories
base_dirs.extend([
"/root/cache/apple_podcasts",
"/root/cache/xyz_podcasts",
"/tmp/downloads"
])
elif is_docker:
# Docker environment
base_dirs.extend([
"/app/downloads",
"/data/downloads",
"/tmp/downloads"
])
else:
# Local environment - use current directory and common locations
base_dirs.extend([
str(current_dir / "downloads"),
str(current_dir / "cache" / "apple_podcasts"),
str(current_dir / "cache" / "xyz_podcasts"),
"~/Downloads",
"~/Music"
])
# Add common directories
base_dirs.extend(["/tmp", "."])
# Filter out duplicates while preserving order
seen = set()
unique_dirs = []
for d in base_dirs:
if d not in seen:
seen.add(d)
unique_dirs.append(d)
# Determine default directory
default_dir = unique_dirs[0] if unique_dirs else str(current_dir / "downloads")
return unique_dirs, default_dir
def create_gradio_interface():
"""Create Gradio interface
Returns:
gr.Blocks: Configured Gradio interface
"""
with gr.Blocks(title="ModalTranscriberMCP") as demo:
gr.Markdown("# πŸŽ™οΈ ModalTranscriberMCP")
gr.Markdown("**Advanced Audio Transcription with Modal Cloud Computing & MCP Integration**")
# Performance Highlight
gr.Markdown("""
### ⚑ **Supercharged by Modal Serverless GPU**
πŸš€ **10-20x Faster Processing**: 1-hour audio transcribed in just 3-6 minutes
🎯 **Parallel GPU Processing**: Up to 10 concurrent GPU containers
☁️ **Zero Infrastructure Management**: Fully serverless, pay-per-use
""", elem_classes=["performance-highlight"])
# MCP Usage Instructions
gr.Markdown("""
### πŸ“š How to Use This MCP Server
This application provides both a **Web UI** and **MCP (Model Context Protocol) Tools** for AI assistants
A youtube video demo is below:
[![YouTube Video](https://img.youtube.com/vi/Ut5jw7Epb0o/0.jpg)](https://youtu.be/Ut5jw7Epb0o)
#### 🌐 Web Interface
- Use the tabs above to download podcasts and transcribe audio files
- Support for multi-platform downloads (Apple Podcasts, XiaoYuZhou)
- Advanced features: speaker diarization (requires Hugging Face Token), multiple output formats
- **High-speed processing**: Powered by Modal's distributed GPU infrastructure
#### πŸ€– MCP Integration
**For AI Assistants (Claude, etc.):**
```
MCP Server URL: /api/mcp
Available Tools:
β€’ transcribe_audio_file_tool - High-quality audio transcription
β€’ download_apple_podcast_tool - Apple Podcasts audio download
β€’ download_xyz_podcast_tool - XiaoYuZhou podcast download
β€’ get_mp3_files_tool - Scan directories for audio files
β€’ get_file_info_tool - Get file information
β€’ read_text_file_segments_tool - Read large text files in chunks
```
**Connect via MCP Client:**
we deploy mcp server on modal, and we can use the url to connect to the gradio ui
the url is: http://richardsucran--gradio-mcp-ui-app-entry.modal.run
and json config is:
```json
{
"mcpServers": {
"podcast-mcp": {
"url": "http://richardsucran--gradio-mcp-ui-app-entry.modal.run/api/mcp"
}
}
}
```
""", elem_classes=["mcp-instructions"])
# ==================== Podcast Download Tab ====================
with gr.Tab("Podcast Download"):
gr.Markdown("### πŸŽ™οΈ Download Podcast Audio")
url_input = gr.Textbox(
label="Podcast Link",
placeholder="Enter podcast page URL",
lines=1
)
platform_choice = gr.Radio(
choices=["Apple Podcast", "XiaoYuZhou"],
label="Select Podcast Platform",
value="Apple Podcast"
)
# Transcription options
with gr.Row():
auto_transcribe = gr.Checkbox(
label="Auto-transcribe after download",
value=True,
info="Start transcription immediately after download"
)
enable_speaker_diarization = gr.Checkbox(
label="Enable speaker diarization",
value=False,
info="Identify different speakers (requires Hugging Face Token)"
)
# HF Token input for speaker diarization
hf_token_input_download = gr.Textbox(
label="Hugging Face Token (Optional)",
placeholder="Enter your HF token here to override environment variable",
type="password",
info="Required for speaker diarization. If provided, will override HF_TOKEN environment variable."
)
download_btn = gr.Button("πŸ“₯ Start Download", variant="primary")
result_output = gr.JSON(label="Download Results")
async def download_podcast_and_transcribe(url, platform, auto_transcribe, enable_speaker, hf_token):
"""Call corresponding download tool based on selected platform"""
# Temporarily set HF_TOKEN if provided
original_token = temporarily_set_hf_token(hf_token)
try:
if platform == "Apple Podcast":
download_result = await mcp_tools.download_apple_podcast(url)
else:
download_result = await mcp_tools.download_xyz_podcast(url)
# 2. Check if download was successful
if download_result["status"] != "success":
return {
"download_status": "failed",
"error_message": download_result.get("error_message", "Download failed"),
"transcription_status": "not_started"
}
# 3. If not auto-transcribing, return only download results
if not auto_transcribe:
return {
"download_status": "success",
"audio_file": download_result["audio_file_path"],
"transcription_status": "skipped (user chose not to auto-transcribe)"
}
# 4. Start transcription
try:
audio_path = download_result["audio_file_path"]
print(f"Transcribing audio file: {audio_path}")
transcribe_result = await mcp_tools.transcribe_audio_file(
audio_path,
model_size="turbo",
language=None,
output_format="srt",
enable_speaker_diarization=enable_speaker
)
# 5. Merge results
result = {
"download_status": "success",
"audio_file": audio_path,
"transcription_status": "success",
"txt_file_path": transcribe_result.get("txt_file_path"),
"srt_file_path": transcribe_result.get("srt_file_path"),
"transcription_details": {
"model_used": transcribe_result.get("model_used"),
"segment_count": transcribe_result.get("segment_count"),
"audio_duration": transcribe_result.get("audio_duration"),
"saved_files": transcribe_result.get("saved_files", []),
"speaker_diarization_enabled": transcribe_result.get("speaker_diarization_enabled", False)
}
}
# 6. Add speaker diarization info if enabled
if enable_speaker and transcribe_result.get("speaker_diarization_enabled", False):
result["speaker_diarization"] = {
"global_speaker_count": transcribe_result.get("global_speaker_count", 0),
"speaker_summary": transcribe_result.get("speaker_summary", {})
}
return result
except Exception as e:
return {
"download_status": "success",
"audio_file": download_result["audio_file_path"],
"transcription_status": "failed",
"error_message": str(e)
}
finally:
# Restore original HF_TOKEN
restore_hf_token(original_token)
# Bind callback function
download_btn.click(
download_podcast_and_transcribe,
inputs=[url_input, platform_choice, auto_transcribe, enable_speaker_diarization, hf_token_input_download],
outputs=result_output
)
# ==================== Audio Transcription Tab ====================
with gr.Tab("Audio Transcription"):
gr.Markdown("### 🎀 Audio Transcription and Speaker Diarization")
gr.Markdown("Upload audio files for high-quality transcription with speaker diarization support")
with gr.Row():
with gr.Column(scale=2):
# Audio file input
audio_file_input = gr.Textbox(
label="Audio File Path",
placeholder="Enter complete path to audio file (supports mp3, wav, m4a, etc.)",
lines=1
)
# Transcription parameter settings
with gr.Row():
model_size_choice = gr.Dropdown(
choices=["tiny", "base", "small", "medium", "large", "turbo"],
value="turbo",
label="Model Size",
info="Affects transcription accuracy and speed"
)
language_choice = gr.Dropdown(
choices=["auto", "zh", "en", "ja", "ko", "fr", "de", "es"],
value="auto",
label="Language",
info="auto=auto-detect"
)
with gr.Row():
with gr.Column():
output_format_choice = gr.Radio(
choices=["srt", "txt", "json"],
value="srt",
label="Output Format"
)
with gr.Column():
enable_speaker_separation = gr.Checkbox(
label="Enable speaker diarization",
value=False,
info="Requires Hugging Face Token"
)
# HF Token input for speaker diarization
hf_token_input_transcribe = gr.Textbox(
label="Hugging Face Token (Optional)",
placeholder="Enter your HF token here to override environment variable",
type="password",
info="Required for speaker diarization. If provided, will override HF_TOKEN environment variable."
)
transcribe_btn = gr.Button("🎀 Start Transcription", variant="primary", size="lg")
with gr.Column(scale=1):
# Audio file information
audio_info_output = gr.JSON(label="Audio File Information", visible=False)
# Transcription progress and status
transcribe_status = gr.Textbox(
label="Transcription Status",
value="Waiting to start transcription...",
interactive=False,
lines=3
)
# Transcription results display
transcribe_result_output = gr.JSON(
label="Transcription Results",
visible=True
)
# Speaker diarization results (if enabled)
speaker_info_output = gr.JSON(
label="Speaker Diarization Information",
visible=False
)
def perform_transcription(audio_path, model_size, language, output_format, enable_speaker, hf_token):
"""Execute audio transcription"""
if not audio_path.strip():
return {
"error": "Please enter audio file path"
}, "Transcription failed: No audio file selected", gr.update(visible=False)
# Temporarily set HF_TOKEN if provided
original_token = temporarily_set_hf_token(hf_token)
try:
# Check if file exists
import asyncio
file_info = asyncio.run(get_file_info_tool(audio_path))
if file_info["status"] != "success":
return {
"error": f"File does not exist or cannot be accessed: {file_info.get('error_message', 'Unknown error')}"
}, "Transcription failed: File inaccessible", gr.update(visible=False)
try:
# Process language parameter
lang = None if language == "auto" else language
# Call transcription tool
result = asyncio.run(transcribe_audio_file_tool(
audio_file_path=audio_path,
model_size=model_size,
language=lang,
output_format=output_format,
enable_speaker_diarization=enable_speaker
))
# Prepare status information
if result.get("processing_status") == "success":
status_text = f"""βœ… Transcription completed!
πŸ“ Generated files: {len(result.get('saved_files', []))} files
🎡 Audio duration: {result.get('audio_duration', 0):.2f} seconds
πŸ“ Transcription segments: {result.get('segment_count', 0)} segments
🎯 Model used: {result.get('model_used', 'N/A')}
🎭 Speaker diarization: {'Enabled' if result.get('speaker_diarization_enabled', False) else 'Disabled'}"""
# Show speaker information
speaker_visible = result.get('speaker_diarization_enabled', False) and result.get('global_speaker_count', 0) > 0
speaker_info = result.get('speaker_summary', {}) if speaker_visible else {}
return result, status_text, gr.update(visible=speaker_visible, value=speaker_info)
else:
error_msg = result.get('error_message', 'Unknown error')
return result, f"❌ Transcription failed: {error_msg}", gr.update(visible=False)
except Exception as e:
return {
"error": f"Exception occurred during transcription: {str(e)}"
}, f"❌ Transcription exception: {str(e)}", gr.update(visible=False)
finally:
# Restore original HF_TOKEN
restore_hf_token(original_token)
# Bind transcription button
transcribe_btn.click(
perform_transcription,
inputs=[
audio_file_input,
model_size_choice,
language_choice,
output_format_choice,
enable_speaker_separation,
hf_token_input_transcribe
],
outputs=[
transcribe_result_output,
transcribe_status,
speaker_info_output
]
)
# ==================== MP3 File Management Tab ====================
with gr.Tab("MP3 File Management"):
gr.Markdown("### 🎡 MP3 File Management")
# Get environment-specific directories
available_dirs, default_dir = get_default_directories()
# Display environment info
import pathlib
is_modal = os.environ.get("MODAL_ENVIRONMENT") == "1" or os.path.exists("/modal")
is_docker = os.path.exists("/.dockerenv")
current_dir = pathlib.Path.cwd()
if is_modal:
env_info = "πŸš€ **Modal Environment Detected** - Using Modal cache directories"
elif is_docker:
env_info = "🐳 **Docker Environment Detected** - Using container directories"
else:
env_info = f"πŸ’» **Local Environment Detected** - Using current directory: `{current_dir}`"
gr.Markdown(env_info)
with gr.Row():
with gr.Column(scale=3):
# Flexible directory path input
custom_dir_input = gr.Textbox(
label="Custom Directory Path",
placeholder="Enter custom directory path (e.g., /path/to/your/audio/files)",
lines=1,
value=default_dir
)
with gr.Column(scale=2):
# Quick select for environment-specific directories
quick_select = gr.Dropdown(
label="Quick Select",
choices=available_dirs,
value=default_dir,
info="Select directories based on current environment"
)
with gr.Column(scale=1):
scan_btn = gr.Button("πŸ” Scan Directory", variant="primary")
file_list = gr.Textbox(
label="MP3 File List",
interactive=False,
lines=10,
max_lines=20,
show_copy_button=True,
autoscroll=True
)
def list_mp3_files(directory):
"""List MP3 files in directory"""
if not directory or not directory.strip():
return "Please enter a directory path"
try:
result = asyncio.run(get_mp3_files_tool(directory.strip()))
# Check if there's an error
if "error_message" in result:
return f"❌ Error scanning directory: {result['error_message']}"
# Get file list
total_files = result.get('total_files', 0)
file_list = result.get('file_list', [])
scanned_directory = result.get('scanned_directory', directory)
if total_files == 0:
return f"πŸ“‚ No MP3 files found in: {scanned_directory}"
# Format file list for display
display_lines = [
f"πŸ“‚ Found {total_files} MP3 file{'s' if total_files != 1 else ''} in: {scanned_directory}",
"=" * 60
]
for i, file_info in enumerate(file_list, 1):
filename = file_info.get('filename', 'Unknown')
size_mb = file_info.get('file_size_mb', 0)
created_time = file_info.get('created_time', 'Unknown')
full_path = file_info.get('full_path', 'Unknown')
display_lines.append(
f"{i:2d}. πŸ“„ {filename}\n"
f" πŸ’Ύ Size: {size_mb:.2f} MB\n"
f" πŸ“… Created: {created_time}\n"
f" πŸ“ Path: {full_path}"
)
return "\n".join(display_lines)
except Exception as e:
return f"❌ Exception occurred while scanning directory: {str(e)}"
def use_quick_select(selected_path):
"""Use quick select path and auto-scan"""
if selected_path:
return selected_path, list_mp3_files(selected_path)
return "", ""
def scan_directory(custom_path, quick_path):
"""Scan the directory based on custom input or quick select"""
directory = custom_path.strip() if custom_path.strip() else quick_path
return list_mp3_files(directory)
# Bind callback functions
quick_select.change(
use_quick_select,
inputs=[quick_select],
outputs=[custom_dir_input, file_list]
)
scan_btn.click(
scan_directory,
inputs=[custom_dir_input, quick_select],
outputs=[file_list]
)
# Auto-scan when custom directory is entered
custom_dir_input.change(
lambda x: list_mp3_files(x) if x.strip() else "",
inputs=[custom_dir_input],
outputs=[file_list]
)
# ==================== Transcription Text Management Tab ====================
with gr.Tab("Transcription Text Management"):
gr.Markdown("### πŸ“ Transcription Text File Management")
gr.Markdown("View TXT and SRT files generated from audio transcription")
# File path input
file_path_input = gr.Textbox(
label="File Path",
placeholder="Enter path to TXT or SRT file to read",
lines=1
)
# Load button
load_file_btn = gr.Button("πŸ“‚ Load File", variant="primary")
# Text content viewer
content_editor = gr.Textbox(
label="File Content",
placeholder="File content will be displayed here after loading...",
lines=25,
max_lines=40,
show_copy_button=True,
interactive=False
)
# Status information
status_output = gr.Textbox(
label="Status",
interactive=False,
lines=2
)
def load_and_display_file(file_path):
"""Load and display complete file content"""
if not file_path.strip():
return "Please enter a file path", "❌ No file path provided"
try:
# Get file info first
info = asyncio.run(get_file_info_tool(file_path))
if info["status"] != "success":
return "", f"❌ Error: {info.get('error_message', 'Unknown error')}"
# Check file size (warn for very large files)
file_size_mb = info.get('file_size_mb', 0)
if file_size_mb > 10: # Warn for files larger than 10MB
return "", f"⚠️ File is too large ({file_size_mb:.2f} MB). Please use a smaller file for viewing."
# Read entire file content
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
# Status message
status = f"βœ… File loaded successfully: {info.get('filename', 'Unknown')}\nπŸ“ Size: {file_size_mb:.2f} MB"
return content, status
except UnicodeDecodeError:
return "", "❌ Error: File contains non-text content or encoding is not UTF-8"
except FileNotFoundError:
return "", "❌ Error: File not found"
except PermissionError:
return "", "❌ Error: Permission denied to read file"
except Exception as e:
return "", f"❌ Error: {str(e)}"
# Bind event handler
load_file_btn.click(
load_and_display_file,
inputs=[file_path_input],
outputs=[content_editor, status_output]
)
return demo