""" |
|
Gradio interface module |
|
Contains all UI components and interface logic |
|
""" |
|
|
|
import gradio as gr |
|
import asyncio |
|
import os |
|
from ..tools import mcp_tools |
|
from ..tools.download_tools import get_file_info_tool, get_mp3_files_tool, read_text_file_segments_tool |
|
from ..tools.transcription_tools import transcribe_audio_file_tool |
|
|
|
def write_text_file_content(file_path: str, content: str, mode: str = "w", position: int = None):
    """Simple text file writing function"""
    try:
        if mode == "r+" and position is not None:
            with open(file_path, mode, encoding='utf-8') as f:
                f.seek(position)
                characters_written = f.write(content)
        else:
            with open(file_path, mode, encoding='utf-8') as f:
                characters_written = f.write(content)

        return {
            "status": "success",
            "characters_written": characters_written,
            "operation_type": mode,
            "size_change": len(content)
        }
    except Exception as e:
        return {
            "status": "failed",
            "error_message": str(e)
        }

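# Illustrative usage of write_text_file_content (the path and content below are
# hypothetical; callers branch on the returned "status" field):
#
#     result = write_text_file_content("/tmp/example_note.txt", "hello\n")
#     if result["status"] == "success":
#         print(f"wrote {result['characters_written']} characters")
#     else:
#         print(f"write failed: {result['error_message']}")
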
def temporarily_set_hf_token(hf_token: str):
    """Temporarily set HF_TOKEN in environment"""
    original_token = os.environ.get("HF_TOKEN")
    if hf_token and hf_token.strip():
        os.environ["HF_TOKEN"] = hf_token.strip()
        print(f"🔑 Using user-provided HF_TOKEN: {hf_token[:10]}...")
    return original_token


def restore_hf_token(original_token: str):
    """Restore original HF_TOKEN in environment"""
    if original_token is not None:
        os.environ["HF_TOKEN"] = original_token
    elif "HF_TOKEN" in os.environ:
        del os.environ["HF_TOKEN"]

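# The two helpers above are intended to be used as a set/restore pair around any
# call that reads HF_TOKEN from the environment, mirroring the try/finally pattern
# used by the Gradio handlers below (the token value here is a placeholder):
#
#     original = temporarily_set_hf_token("hf_xxxxxxxx")
#     try:
#         ...  # run transcription / speaker diarization
#     finally:
#         restore_hf_token(original)
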
def get_default_directories():
    """Get default directories based on current environment"""
    import pathlib

    is_modal = os.environ.get("MODAL_ENVIRONMENT") == "1" or os.path.exists("/modal")
    is_docker = os.path.exists("/.dockerenv")
    current_dir = pathlib.Path.cwd()

    base_dirs = []

    if is_modal:
        base_dirs.extend([
            "/root/cache/apple_podcasts",
            "/root/cache/xyz_podcasts",
            "/tmp/downloads"
        ])
    elif is_docker:
        base_dirs.extend([
            "/app/downloads",
            "/data/downloads",
            "/tmp/downloads"
        ])
    else:
        base_dirs.extend([
            str(current_dir / "downloads"),
            str(current_dir / "cache" / "apple_podcasts"),
            str(current_dir / "cache" / "xyz_podcasts"),
            "~/Downloads",
            "~/Music"
        ])

    # Always include generic fallback locations
    base_dirs.extend(["/tmp", "."])

    # De-duplicate while preserving order
    seen = set()
    unique_dirs = []
    for d in base_dirs:
        if d not in seen:
            seen.add(d)
            unique_dirs.append(d)

    default_dir = unique_dirs[0] if unique_dirs else str(current_dir / "downloads")

    return unique_dirs, default_dir

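# Sketch of how the helper above is consumed later in this module; the concrete
# candidate directories depend on whether the code runs on Modal, in Docker, or
# locally, and the first candidate becomes the default of the directory picker:
#
#     available_dirs, default_dir = get_default_directories()
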
def create_gradio_interface():
    """Create Gradio interface

    Returns:
        gr.Blocks: Configured Gradio interface
    """

    with gr.Blocks(title="ModalTranscriberMCP") as demo:
        gr.Markdown("# 🎙️ ModalTranscriberMCP")
        gr.Markdown("**Advanced Audio Transcription with Modal Cloud Computing & MCP Integration**")

        gr.Markdown("""
        ### ⚡ **Supercharged by Modal Serverless GPU**
        🚀 **10-20x Faster Processing**: 1-hour audio transcribed in just 3-6 minutes
        🎯 **Parallel GPU Processing**: Up to 10 concurrent GPU containers
        ☁️ **Zero Infrastructure Management**: Fully serverless, pay-per-use
        """, elem_classes=["performance-highlight"])

gr.Markdown(""" |
|
### π How to Use This MCP Server |
|
|
|
This application provides both a **Web UI** and **MCP (Model Context Protocol) Tools** for AI assistants |
|
|
|
A youtube video demo is below: |
|
|
|
[](https://youtu.be/Ut5jw7Epb0o) |
|
|
|
|
|
|
|
#### π Web Interface |
|
- Use the tabs above to download podcasts and transcribe audio files |
|
- Support for multi-platform downloads (Apple Podcasts, XiaoYuZhou) |
|
- Advanced features: speaker diarization (requires Hugging Face Token), multiple output formats |
|
- **High-speed processing**: Powered by Modal's distributed GPU infrastructure |
|
|
|
#### π€ MCP Integration |
|
**For AI Assistants (Claude, etc.):** |
|
``` |
|
MCP Server URL: /api/mcp |
|
Available Tools: |
|
β’ transcribe_audio_file_tool - High-quality audio transcription |
|
β’ download_apple_podcast_tool - Apple Podcasts audio download |
|
β’ download_xyz_podcast_tool - XiaoYuZhou podcast download |
|
β’ get_mp3_files_tool - Scan directories for audio files |
|
β’ get_file_info_tool - Get file information |
|
β’ read_text_file_segments_tool - Read large text files in chunks |
|
``` |
|
|
|
**Connect via MCP Client:** |
|
|
|
we deploy mcp server on modal, and we can use the url to connect to the gradio ui |
|
the url is: http://richardsucran--gradio-mcp-ui-app-entry.modal.run |
|
and json config is: |
|
```json |
|
{ |
|
"mcpServers": { |
|
"podcast-mcp": { |
|
"url": "http://richardsucran--gradio-mcp-ui-app-entry.modal.run/api/mcp" |
|
} |
|
} |
|
} |
|
``` |
|
""", elem_classes=["mcp-instructions"]) |
|
|
|
|
|
with gr.Tab("Podcast Download"): |
|
gr.Markdown("### ποΈ Download Podcast Audio") |
|
|
|
url_input = gr.Textbox( |
|
label="Podcast Link", |
|
placeholder="Enter podcast page URL", |
|
lines=1 |
|
) |
|
|
|
platform_choice = gr.Radio( |
|
choices=["Apple Podcast", "XiaoYuZhou"], |
|
label="Select Podcast Platform", |
|
value="Apple Podcast" |
|
) |
|
|
|
|
|
with gr.Row(): |
|
auto_transcribe = gr.Checkbox( |
|
label="Auto-transcribe after download", |
|
value=True, |
|
info="Start transcription immediately after download" |
|
) |
|
enable_speaker_diarization = gr.Checkbox( |
|
label="Enable speaker diarization", |
|
value=False, |
|
info="Identify different speakers (requires Hugging Face Token)" |
|
) |
|
|
|
|
|
hf_token_input_download = gr.Textbox( |
|
label="Hugging Face Token (Optional)", |
|
placeholder="Enter your HF token here to override environment variable", |
|
type="password", |
|
info="Required for speaker diarization. If provided, will override HF_TOKEN environment variable." |
|
) |
|
|
|
download_btn = gr.Button("π₯ Start Download", variant="primary") |
|
result_output = gr.JSON(label="Download Results") |
|
|
|
            async def download_podcast_and_transcribe(url, platform, auto_transcribe, enable_speaker, hf_token):
                """Call corresponding download tool based on selected platform"""
                original_token = temporarily_set_hf_token(hf_token)

                try:
                    if platform == "Apple Podcast":
                        download_result = await mcp_tools.download_apple_podcast(url)
                    else:
                        download_result = await mcp_tools.download_xyz_podcast(url)

                    if download_result["status"] != "success":
                        return {
                            "download_status": "failed",
                            "error_message": download_result.get("error_message", "Download failed"),
                            "transcription_status": "not_started"
                        }

                    if not auto_transcribe:
                        return {
                            "download_status": "success",
                            "audio_file": download_result["audio_file_path"],
                            "transcription_status": "skipped (user chose not to auto-transcribe)"
                        }

                    try:
                        audio_path = download_result["audio_file_path"]
                        print(f"Transcribing audio file: {audio_path}")
                        transcribe_result = await mcp_tools.transcribe_audio_file(
                            audio_path,
                            model_size="turbo",
                            language=None,
                            output_format="srt",
                            enable_speaker_diarization=enable_speaker
                        )

                        result = {
                            "download_status": "success",
                            "audio_file": audio_path,
                            "transcription_status": "success",
                            "txt_file_path": transcribe_result.get("txt_file_path"),
                            "srt_file_path": transcribe_result.get("srt_file_path"),
                            "transcription_details": {
                                "model_used": transcribe_result.get("model_used"),
                                "segment_count": transcribe_result.get("segment_count"),
                                "audio_duration": transcribe_result.get("audio_duration"),
                                "saved_files": transcribe_result.get("saved_files", []),
                                "speaker_diarization_enabled": transcribe_result.get("speaker_diarization_enabled", False)
                            }
                        }

                        if enable_speaker and transcribe_result.get("speaker_diarization_enabled", False):
                            result["speaker_diarization"] = {
                                "global_speaker_count": transcribe_result.get("global_speaker_count", 0),
                                "speaker_summary": transcribe_result.get("speaker_summary", {})
                            }

                        return result

                    except Exception as e:
                        return {
                            "download_status": "success",
                            "audio_file": download_result["audio_file_path"],
                            "transcription_status": "failed",
                            "error_message": str(e)
                        }

                finally:
                    restore_hf_token(original_token)

            download_btn.click(
                download_podcast_and_transcribe,
                inputs=[url_input, platform_choice, auto_transcribe, enable_speaker_diarization, hf_token_input_download],
                outputs=result_output
            )

with gr.Tab("Audio Transcription"): |
|
gr.Markdown("### π€ Audio Transcription and Speaker Diarization") |
|
gr.Markdown("Upload audio files for high-quality transcription with speaker diarization support") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=2): |
|
|
|
audio_file_input = gr.Textbox( |
|
label="Audio File Path", |
|
placeholder="Enter complete path to audio file (supports mp3, wav, m4a, etc.)", |
|
lines=1 |
|
) |
|
|
|
|
|
with gr.Row(): |
|
model_size_choice = gr.Dropdown( |
|
choices=["tiny", "base", "small", "medium", "large", "turbo"], |
|
value="turbo", |
|
label="Model Size", |
|
info="Affects transcription accuracy and speed" |
|
) |
|
language_choice = gr.Dropdown( |
|
choices=["auto", "zh", "en", "ja", "ko", "fr", "de", "es"], |
|
value="auto", |
|
label="Language", |
|
info="auto=auto-detect" |
|
) |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
output_format_choice = gr.Radio( |
|
choices=["srt", "txt", "json"], |
|
value="srt", |
|
label="Output Format" |
|
) |
|
with gr.Column(): |
|
enable_speaker_separation = gr.Checkbox( |
|
label="Enable speaker diarization", |
|
value=False, |
|
info="Requires Hugging Face Token" |
|
) |
|
|
|
hf_token_input_transcribe = gr.Textbox( |
|
label="Hugging Face Token (Optional)", |
|
placeholder="Enter your HF token here to override environment variable", |
|
type="password", |
|
info="Required for speaker diarization. If provided, will override HF_TOKEN environment variable." |
|
) |
|
|
|
transcribe_btn = gr.Button("π€ Start Transcription", variant="primary", size="lg") |
|
|
|
with gr.Column(scale=1): |
|
|
|
audio_info_output = gr.JSON(label="Audio File Information", visible=False) |
|
|
|
|
|
transcribe_status = gr.Textbox( |
|
label="Transcription Status", |
|
value="Waiting to start transcription...", |
|
interactive=False, |
|
lines=3 |
|
) |
|
|
|
|
|
transcribe_result_output = gr.JSON( |
|
label="Transcription Results", |
|
visible=True |
|
) |
|
|
|
|
|
speaker_info_output = gr.JSON( |
|
label="Speaker Diarization Information", |
|
visible=False |
|
) |
|
|
|
            def perform_transcription(audio_path, model_size, language, output_format, enable_speaker, hf_token):
                """Execute audio transcription"""
                if not audio_path.strip():
                    return {
                        "error": "Please enter audio file path"
                    }, "Transcription failed: No audio file selected", gr.update(visible=False)

                original_token = temporarily_set_hf_token(hf_token)

                try:
                    file_info = asyncio.run(get_file_info_tool(audio_path))
                    if file_info["status"] != "success":
                        return {
                            "error": f"File does not exist or cannot be accessed: {file_info.get('error_message', 'Unknown error')}"
                        }, "Transcription failed: File inaccessible", gr.update(visible=False)

                    try:
                        lang = None if language == "auto" else language

                        result = asyncio.run(transcribe_audio_file_tool(
                            audio_file_path=audio_path,
                            model_size=model_size,
                            language=lang,
                            output_format=output_format,
                            enable_speaker_diarization=enable_speaker
                        ))

                        if result.get("processing_status") == "success":
                            status_text = f"""✅ Transcription completed!
📄 Generated files: {len(result.get('saved_files', []))} files
🎵 Audio duration: {result.get('audio_duration', 0):.2f} seconds
📝 Transcription segments: {result.get('segment_count', 0)} segments
🎯 Model used: {result.get('model_used', 'N/A')}
👥 Speaker diarization: {'Enabled' if result.get('speaker_diarization_enabled', False) else 'Disabled'}"""

                            speaker_visible = result.get('speaker_diarization_enabled', False) and result.get('global_speaker_count', 0) > 0
                            speaker_info = result.get('speaker_summary', {}) if speaker_visible else {}

                            return result, status_text, gr.update(visible=speaker_visible, value=speaker_info)
                        else:
                            error_msg = result.get('error_message', 'Unknown error')
                            return result, f"❌ Transcription failed: {error_msg}", gr.update(visible=False)

                    except Exception as e:
                        return {
                            "error": f"Exception occurred during transcription: {str(e)}"
                        }, f"❌ Transcription exception: {str(e)}", gr.update(visible=False)

                finally:
                    restore_hf_token(original_token)

            transcribe_btn.click(
                perform_transcription,
                inputs=[
                    audio_file_input,
                    model_size_choice,
                    language_choice,
                    output_format_choice,
                    enable_speaker_separation,
                    hf_token_input_transcribe
                ],
                outputs=[
                    transcribe_result_output,
                    transcribe_status,
                    speaker_info_output
                ]
            )

with gr.Tab("MP3 File Management"): |
|
gr.Markdown("### π΅ MP3 File Management") |
|
|
|
|
|
available_dirs, default_dir = get_default_directories() |
|
|
|
|
|
import pathlib |
|
is_modal = os.environ.get("MODAL_ENVIRONMENT") == "1" or os.path.exists("/modal") |
|
is_docker = os.path.exists("/.dockerenv") |
|
current_dir = pathlib.Path.cwd() |
|
|
|
if is_modal: |
|
env_info = "π **Modal Environment Detected** - Using Modal cache directories" |
|
elif is_docker: |
|
env_info = "π³ **Docker Environment Detected** - Using container directories" |
|
else: |
|
env_info = f"π» **Local Environment Detected** - Using current directory: `{current_dir}`" |
|
|
|
gr.Markdown(env_info) |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=3): |
|
|
|
custom_dir_input = gr.Textbox( |
|
label="Custom Directory Path", |
|
placeholder="Enter custom directory path (e.g., /path/to/your/audio/files)", |
|
lines=1, |
|
value=default_dir |
|
) |
|
with gr.Column(scale=2): |
|
|
|
quick_select = gr.Dropdown( |
|
label="Quick Select", |
|
choices=available_dirs, |
|
value=default_dir, |
|
info="Select directories based on current environment" |
|
) |
|
with gr.Column(scale=1): |
|
scan_btn = gr.Button("π Scan Directory", variant="primary") |
|
|
|
file_list = gr.Textbox( |
|
label="MP3 File List", |
|
interactive=False, |
|
lines=10, |
|
max_lines=20, |
|
show_copy_button=True, |
|
autoscroll=True |
|
) |
|
|
|
            def list_mp3_files(directory):
                """List MP3 files in directory"""
                if not directory or not directory.strip():
                    return "Please enter a directory path"

                try:
                    result = asyncio.run(get_mp3_files_tool(directory.strip()))

                    if "error_message" in result:
                        return f"❌ Error scanning directory: {result['error_message']}"

                    total_files = result.get('total_files', 0)
                    files = result.get('file_list', [])
                    scanned_directory = result.get('scanned_directory', directory)

                    if total_files == 0:
                        return f"📂 No MP3 files found in: {scanned_directory}"

                    display_lines = [
                        f"📁 Found {total_files} MP3 file{'s' if total_files != 1 else ''} in: {scanned_directory}",
                        "=" * 60
                    ]

                    for i, file_info in enumerate(files, 1):
                        filename = file_info.get('filename', 'Unknown')
                        size_mb = file_info.get('file_size_mb', 0)
                        created_time = file_info.get('created_time', 'Unknown')
                        full_path = file_info.get('full_path', 'Unknown')

                        display_lines.append(
                            f"{i:2d}. 📄 {filename}\n"
                            f"    💾 Size: {size_mb:.2f} MB\n"
                            f"    📅 Created: {created_time}\n"
                            f"    📍 Path: {full_path}"
                        )

                    return "\n".join(display_lines)

                except Exception as e:
                    return f"❌ Exception occurred while scanning directory: {str(e)}"

            def use_quick_select(selected_path):
                """Use quick select path and auto-scan"""
                if selected_path:
                    return selected_path, list_mp3_files(selected_path)
                return "", ""

            def scan_directory(custom_path, quick_path):
                """Scan the directory based on custom input or quick select"""
                directory = custom_path.strip() if custom_path.strip() else quick_path
                return list_mp3_files(directory)

            quick_select.change(
                use_quick_select,
                inputs=[quick_select],
                outputs=[custom_dir_input, file_list]
            )

            scan_btn.click(
                scan_directory,
                inputs=[custom_dir_input, quick_select],
                outputs=[file_list]
            )

            custom_dir_input.change(
                lambda x: list_mp3_files(x) if x.strip() else "",
                inputs=[custom_dir_input],
                outputs=[file_list]
            )

with gr.Tab("Transcription Text Management"): |
|
gr.Markdown("### π Transcription Text File Management") |
|
gr.Markdown("View TXT and SRT files generated from audio transcription") |
|
|
|
|
|
file_path_input = gr.Textbox( |
|
label="File Path", |
|
placeholder="Enter path to TXT or SRT file to read", |
|
lines=1 |
|
) |
|
|
|
|
|
load_file_btn = gr.Button("π Load File", variant="primary") |
|
|
|
|
|
content_editor = gr.Textbox( |
|
label="File Content", |
|
placeholder="File content will be displayed here after loading...", |
|
lines=25, |
|
max_lines=40, |
|
show_copy_button=True, |
|
interactive=False |
|
) |
|
|
|
|
|
status_output = gr.Textbox( |
|
label="Status", |
|
interactive=False, |
|
lines=2 |
|
) |
|
|
|
            def load_and_display_file(file_path):
                """Load and display complete file content"""
                if not file_path.strip():
                    return "Please enter a file path", "❌ No file path provided"

                try:
                    info = asyncio.run(get_file_info_tool(file_path))

                    if info["status"] != "success":
                        return "", f"❌ Error: {info.get('error_message', 'Unknown error')}"

                    file_size_mb = info.get('file_size_mb', 0)
                    if file_size_mb > 10:
                        return "", f"⚠️ File is too large ({file_size_mb:.2f} MB). Please use a smaller file for viewing."

                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()

                    status = f"✅ File loaded successfully: {info.get('filename', 'Unknown')}\n📄 Size: {file_size_mb:.2f} MB"

                    return content, status

                except UnicodeDecodeError:
                    return "", "❌ Error: File contains non-text content or encoding is not UTF-8"
                except FileNotFoundError:
                    return "", "❌ Error: File not found"
                except PermissionError:
                    return "", "❌ Error: Permission denied to read file"
                except Exception as e:
                    return "", f"❌ Error: {str(e)}"

            load_file_btn.click(
                load_and_display_file,
                inputs=[file_path_input],
                outputs=[content_editor, status_output]
            )

    return demo
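

# Minimal local-run sketch (an assumption, not part of the deployed entrypoint):
# because of the relative imports this module must be run as part of its package,
# e.g. ``python -m <package>.<this_module>``, and MCP serving / queueing are
# assumed to be configured by the real deployment code.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()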