Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- src/__init__.py +13 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/__pycache__/app.cpython-310.pyc +0 -0
- src/adapters/__init__.py +13 -0
- src/adapters/__pycache__/__init__.cpython-310.pyc +0 -0
- src/adapters/__pycache__/local_adapter.cpython-310.pyc +0 -0
- src/adapters/__pycache__/modal_adapter.cpython-310.pyc +0 -0
- src/adapters/__pycache__/transcription_adapter_factory.cpython-310.pyc +0 -0
- src/adapters/local_adapter.py +93 -0
- src/adapters/modal_adapter.py +126 -0
- src/adapters/transcription_adapter_factory.py +77 -0
- src/api/__init__.py +5 -0
- src/api/__pycache__/__init__.cpython-310.pyc +0 -0
- src/api/__pycache__/transcription_api.cpython-310.pyc +0 -0
- src/api/transcription_api.py +112 -0
- src/app.py +169 -0
- src/config/__init__.py +5 -0
- src/config/__pycache__/__init__.cpython-310.pyc +0 -0
- src/config/__pycache__/config.cpython-310.pyc +0 -0
- src/config/__pycache__/modal_config.cpython-310.pyc +0 -0
- src/config/config.py +81 -0
- src/config/modal_config.py +210 -0
- src/core/__init__.py +29 -0
- src/core/__pycache__/__init__.cpython-310.pyc +0 -0
- src/core/__pycache__/audio_splitter.cpython-310.pyc +0 -0
- src/core/__pycache__/config.cpython-310.pyc +0 -0
- src/core/__pycache__/exceptions.cpython-310.pyc +0 -0
- src/core/__pycache__/speaker_diarization.cpython-310.pyc +0 -0
- src/core/__pycache__/whisper_transcriber.cpython-310.pyc +0 -0
- src/core/audio_splitter.py +90 -0
- src/core/config.py +150 -0
- src/core/exceptions.py +43 -0
- src/core/speaker_diarization.py +126 -0
- src/core/whisper_transcriber.py +113 -0
- src/deployment/__init__.py +8 -0
- src/deployment/deployment_manager.py +153 -0
- src/deployment/endpoint_manager.py +76 -0
- src/deployment/modal_deployer.py +97 -0
- src/interfaces/__init__.py +38 -0
- src/interfaces/__pycache__/__init__.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/audio_processor.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/audio_splitter.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/podcast_downloader.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/speaker_detector.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/speaker_manager.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/transcriber.cpython-310.pyc +0 -0
- src/interfaces/audio_processor.py +53 -0
- src/interfaces/audio_splitter.py +48 -0
- src/interfaces/podcast_downloader.py +66 -0
- src/interfaces/speaker_detector.py +71 -0
src/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""
PodcastMcpGradio - Podcast Processing and Analysis Framework

A comprehensive framework for podcast downloading, transcription, and analysis
with MCP (Model Context Protocol) integration and Gradio UI.
"""

__version__ = "2.0.0"
__author__ = "PodcastMcpGradio Team"
__description__ = "Podcast Processing and Analysis Framework"

# Core modules will be imported as needed
__all__ = []
src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (507 Bytes).
src/__pycache__/app.cpython-310.pyc
ADDED
Binary file (5.02 kB).
src/adapters/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""
Adapters for different transcription backends
"""

from .transcription_adapter_factory import TranscriptionAdapterFactory
from .local_adapter import LocalTranscriptionAdapter
from .modal_adapter import ModalTranscriptionAdapter

__all__ = [
    "TranscriptionAdapterFactory",
    "LocalTranscriptionAdapter",
    "ModalTranscriptionAdapter"
]
src/adapters/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (460 Bytes).
src/adapters/__pycache__/local_adapter.cpython-310.pyc
ADDED
Binary file (3.21 kB).
src/adapters/__pycache__/modal_adapter.cpython-310.pyc
ADDED
Binary file (3.77 kB).
src/adapters/__pycache__/transcription_adapter_factory.cpython-310.pyc
ADDED
Binary file (2.44 kB).
src/adapters/local_adapter.py
ADDED
@@ -0,0 +1,93 @@
"""
Local transcription adapter for direct processing
"""

import asyncio
from typing import List, Optional

from ..interfaces.transcriber import ITranscriber, TranscriptionResult
from ..utils.config import AudioProcessingConfig
from ..utils.errors import TranscriptionError


class LocalTranscriptionAdapter(ITranscriber):
    """Adapter for local transcription processing"""

    def __init__(self, config: Optional[AudioProcessingConfig] = None):
        self.config = config or AudioProcessingConfig()

    async def transcribe(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False
    ) -> TranscriptionResult:
        """Transcribe audio using local processing"""

        try:
            # Use the new AudioProcessingService instead of old methods
            from ..services.audio_processing_service import AudioProcessingService
            from ..models.services import AudioProcessingRequest

            print(f"🔄 Starting local transcription for: {audio_file_path}")
            print(f"🚀 Running transcription with {model_size} model...")

            # Create service and request
            audio_service = AudioProcessingService()
            request = AudioProcessingRequest(
                audio_file_path=audio_file_path,
                model_size=model_size,
                language=language,
                output_format="json",
                enable_speaker_diarization=enable_speaker_diarization
            )

            # Process transcription
            result = audio_service.transcribe_full_audio(request)

            # Convert service result to adapter format
            return self._convert_service_result(result)

        except Exception as e:
            raise TranscriptionError(
                f"Local transcription failed: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            )

    def get_supported_models(self) -> List[str]:
        """Get list of supported model sizes"""
        return list(self.config.whisper_models.keys())

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        # This would normally come from Whisper's supported languages
        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]

    def _convert_service_result(self, service_result) -> TranscriptionResult:
        """Convert service result format to TranscriptionResult"""
        from ..interfaces.transcriber import TranscriptionSegment

        # Extract segments from service result if available
        segments = []
        if hasattr(service_result, 'segments') and service_result.segments:
            for seg in service_result.segments:
                segments.append(TranscriptionSegment(
                    start=getattr(seg, 'start', 0),
                    end=getattr(seg, 'end', 0),
                    text=getattr(seg, 'text', ''),
                    speaker=getattr(seg, 'speaker', None)
                ))

        return TranscriptionResult(
            text=getattr(service_result, 'text', ''),
            segments=segments,
            language=getattr(service_result, 'language_detected', 'unknown'),
            model_used=getattr(service_result, 'model_used', 'unknown'),
            audio_duration=getattr(service_result, 'audio_duration', 0),
            processing_time=getattr(service_result, 'processing_time', 0),
            speaker_diarization_enabled=getattr(service_result, 'speaker_diarization_enabled', False),
            global_speaker_count=getattr(service_result, 'global_speaker_count', 0),
            error_message=getattr(service_result, 'error_message', None)
        )
src/adapters/modal_adapter.py
ADDED
@@ -0,0 +1,126 @@
"""
Modal transcription adapter for remote processing
"""

import requests
import base64
import pathlib
from typing import List, Optional

from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
from ..utils.config import AudioProcessingConfig
from ..utils.errors import TranscriptionError


class ModalTranscriptionAdapter(ITranscriber):
    """Adapter for Modal remote transcription processing"""

    def __init__(self, config: Optional[AudioProcessingConfig] = None, endpoint_url: Optional[str] = None):
        self.config = config or AudioProcessingConfig()
        self.endpoint_url = endpoint_url

    async def transcribe(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False
    ) -> TranscriptionResult:
        """Transcribe audio using Modal endpoint"""

        if not self.endpoint_url:
            raise TranscriptionError(
                "Modal endpoint URL not configured",
                model=model_size,
                audio_file=audio_file_path
            )

        try:
            # Read and encode audio file
            audio_path = pathlib.Path(audio_file_path)
            if not audio_path.exists():
                raise TranscriptionError(
                    f"Audio file not found: {audio_file_path}",
                    audio_file=audio_file_path
                )

            with open(audio_path, 'rb') as f:
                audio_data = f.read()

            audio_base64 = base64.b64encode(audio_data).decode('utf-8')

            # Prepare request data
            request_data = {
                "audio_file_data": audio_base64,
                "audio_file_name": audio_path.name,
                "model_size": model_size,
                "language": language,
                "output_format": "json",
                "enable_speaker_diarization": enable_speaker_diarization
            }

            print("🔄 Sending transcription request to Modal endpoint")
            print(f"📁 File: {audio_file_path} ({len(audio_data) / (1024*1024):.2f} MB)")
            print(f"🔧 Model: {model_size}, Speaker diarization: {enable_speaker_diarization}")

            # Make request to Modal endpoint
            response = requests.post(
                self.endpoint_url,
                json=request_data,
                timeout=1800  # 30 minutes timeout
            )

            response.raise_for_status()
            result = response.json()

            print("✅ Modal transcription completed")

            # Convert result to TranscriptionResult format
            return self._convert_modal_result(result)

        except requests.exceptions.RequestException as e:
            raise TranscriptionError(
                f"Failed to call Modal endpoint: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            )
        except Exception as e:
            raise TranscriptionError(
                f"Modal transcription failed: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            )

    def get_supported_models(self) -> List[str]:
        """Get list of supported model sizes"""
        return list(self.config.whisper_models.keys())

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]

    def _convert_modal_result(self, modal_result: dict) -> TranscriptionResult:
        """Convert Modal result format to TranscriptionResult"""

        # Extract segments if available
        segments = []
        if "segments" in modal_result:
            for seg in modal_result["segments"]:
                segments.append(TranscriptionSegment(
                    start=seg.get("start", 0),
                    end=seg.get("end", 0),
                    text=seg.get("text", ""),
                    speaker=seg.get("speaker")
                ))

        return TranscriptionResult(
            text=modal_result.get("text", ""),
            segments=segments,
            language=modal_result.get("language_detected", "unknown"),
            model_used=modal_result.get("model_used", "unknown"),
            audio_duration=modal_result.get("audio_duration", 0),
            processing_time=modal_result.get("processing_time", 0),
            speaker_diarization_enabled=modal_result.get("speaker_diarization_enabled", False),
            global_speaker_count=modal_result.get("global_speaker_count", 0),
            error_message=modal_result.get("error_message")
        )
src/adapters/transcription_adapter_factory.py
ADDED
@@ -0,0 +1,77 @@
"""
Factory for creating transcription adapters
"""

import os
from typing import Optional

from ..interfaces.transcriber import ITranscriber
from ..utils.config import AudioProcessingConfig
from ..utils.errors import ConfigurationError
from .local_adapter import LocalTranscriptionAdapter
from .modal_adapter import ModalTranscriptionAdapter


class TranscriptionAdapterFactory:
    """Factory for creating appropriate transcription adapters"""

    @staticmethod
    def create_adapter(
        deployment_mode: str = "auto",
        config: Optional[AudioProcessingConfig] = None,
        endpoint_url: Optional[str] = None
    ) -> ITranscriber:
        """
        Create transcription adapter based on deployment mode

        Args:
            deployment_mode: "local", "modal", or "auto"
            config: Configuration object
            endpoint_url: Modal endpoint URL (for modal/auto mode)

        Returns:
            ITranscriber: Appropriate transcription adapter
        """

        config = config or AudioProcessingConfig()

        # Auto mode: decide based on environment and endpoint availability
        if deployment_mode == "auto":
            if endpoint_url:
                print(f"🌐 Auto mode: Using Modal adapter with endpoint {endpoint_url}")
                return ModalTranscriptionAdapter(config=config, endpoint_url=endpoint_url)
            else:
                print("🏠 Auto mode: Using Local adapter (no endpoint configured)")
                return LocalTranscriptionAdapter(config=config)

        # Explicit local mode
        elif deployment_mode == "local":
            print("🏠 Using Local transcription adapter")
            return LocalTranscriptionAdapter(config=config)

        # Explicit modal mode
        elif deployment_mode == "modal":
            if not endpoint_url:
                raise ConfigurationError(
                    "Modal endpoint URL is required for modal mode",
                    config_key="endpoint_url"
                )
            print(f"🌐 Using Modal transcription adapter with endpoint {endpoint_url}")
            return ModalTranscriptionAdapter(config=config, endpoint_url=endpoint_url)

        else:
            raise ConfigurationError(
                f"Unsupported deployment mode: {deployment_mode}. Use 'local', 'modal', or 'auto'",
                config_key="deployment_mode"
            )

    @staticmethod
    def _detect_deployment_mode() -> str:
        """Auto-detect deployment mode based on environment"""
        # Check if running in Modal environment
        if os.environ.get("MODAL_TASK_ID"):
            return "local"  # We're inside Modal, use local processing
        else:
            return "modal"  # We're outside Modal, use remote endpoint
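A minimal usage sketch of the factory above (not part of the diff; it assumes the package is importable as src.adapters, and the endpoint URL is a hypothetical placeholder):

# Hypothetical usage of TranscriptionAdapterFactory; the endpoint URL is a
# placeholder, not a real deployment.
from src.adapters import TranscriptionAdapterFactory

transcriber = TranscriptionAdapterFactory.create_adapter(
    deployment_mode="auto",
    endpoint_url="https://example--transcribe-audio-chunk-endpoint.modal.run",
)
print(transcriber.get_supported_models())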
src/api/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""
API Module - External interfaces and endpoints
"""

__all__ = []
src/api/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (223 Bytes).
src/api/__pycache__/transcription_api.cpython-310.pyc
ADDED
Binary file (3.27 kB).
src/api/transcription_api.py
ADDED
@@ -0,0 +1,112 @@
"""
Transcription API module
"""

import os
from typing import Optional, Dict, Any

from ..adapters import TranscriptionAdapterFactory
from ..services import TranscriptionService
from ..core import FFmpegAudioSplitter
from ..utils import AudioProcessingConfig, AudioProcessingError


class TranscriptionAPI:
    """High-level API for transcription operations"""

    def __init__(self, config: Optional[AudioProcessingConfig] = None):
        self.config = config or AudioProcessingConfig()
        self.transcription_service = None
        self._initialize_service()

    def _initialize_service(self):
        """Initialize transcription service with appropriate adapter"""
        try:
            # Get endpoint URL from config file if available
            endpoint_url = self._get_endpoint_url()

            # Create appropriate adapter
            transcriber = TranscriptionAdapterFactory.create_adapter(
                deployment_mode="auto",
                config=self.config,
                endpoint_url=endpoint_url
            )

            # Create audio splitter
            audio_splitter = FFmpegAudioSplitter()

            # Create transcription service
            self.transcription_service = TranscriptionService(
                transcriber=transcriber,
                audio_splitter=audio_splitter,
                speaker_detector=None,  # TODO: Add speaker detector when implemented
                config=self.config
            )

        except Exception as e:
            print(f"⚠️ Failed to initialize transcription service: {e}")
            raise AudioProcessingError(f"Service initialization failed: {e}")

    def _get_endpoint_url(self) -> Optional[str]:
        """Get Modal endpoint URL from configuration"""
        try:
            import json
            config_file = "endpoint_config.json"
            if os.path.exists(config_file):
                with open(config_file, 'r') as f:
                    config = json.load(f)
                return config.get("transcribe_audio")
        except Exception:
            pass
        return None

    async def transcribe_audio_file(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        output_format: str = "srt",
        enable_speaker_diarization: bool = False
    ) -> Dict[str, Any]:
        """Transcribe audio file using the configured service"""

        if not self.transcription_service:
            raise AudioProcessingError("Transcription service not initialized")

        return await self.transcription_service.transcribe_audio_file(
            audio_file_path=audio_file_path,
            model_size=model_size,
            language=language,
            output_format=output_format,
            enable_speaker_diarization=enable_speaker_diarization
        )


# Create global API instance
_api_instance = None


def get_transcription_api() -> TranscriptionAPI:
    """Get global transcription API instance"""
    global _api_instance
    if _api_instance is None:
        _api_instance = TranscriptionAPI()
    return _api_instance


async def transcribe_audio_adaptive_sync(
    audio_file_path: str,
    model_size: str = "turbo",
    language: Optional[str] = None,
    output_format: str = "srt",
    enable_speaker_diarization: bool = False
) -> Dict[str, Any]:
    """
    Adaptive transcription function that routes to the appropriate backend
    """
    api = get_transcription_api()
    return await api.transcribe_audio_file(
        audio_file_path=audio_file_path,
        model_size=model_size,
        language=language,
        output_format=output_format,
        enable_speaker_diarization=enable_speaker_diarization
    )
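A minimal call sketch for the adaptive helper above (not part of the diff; the audio path is a placeholder, and the coroutine is driven with asyncio.run):

# Hypothetical call pattern for transcribe_audio_adaptive_sync; the audio
# path is a placeholder.
import asyncio
from src.api.transcription_api import transcribe_audio_adaptive_sync

result = asyncio.run(transcribe_audio_adaptive_sync(
    audio_file_path="episode.mp3",
    model_size="turbo",
    output_format="srt",
))
print(result)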
src/app.py
ADDED
@@ -0,0 +1,169 @@
# FastAPI + Gradio + FastMCP MCP server main entry point

import modal
from contextlib import asynccontextmanager
from fastapi import FastAPI
from gradio.routes import mount_gradio_app
import os
from dotenv import load_dotenv
import uvicorn
from mcp.server.fastmcp import FastMCP

# Import modules
from .tools import mcp_tools  # Import the module, not get_mcp_server function
from .ui.gradio_ui import create_gradio_interface
from .config.config import is_modal_mode, is_local_mode

# Always import modal config since this module might be imported in modal context
try:
    from .config.modal_config import app, image, volume, cache_dir, secrets
    _modal_available = True
except ImportError:
    _modal_available = False

# ==================== Application Creation Function ====================

def create_app():
    """Create and return the complete Gradio + MCP application"""

    print("🚀 Starting Gradio + FastMCP server")

    # Create FastMCP server with new tools
    mcp = FastMCP("Podcast MCP")

    # Register tools using the new service architecture
    @mcp.tool(description="Transcribe audio files to text using Whisper model with speaker diarization support")
    async def transcribe_audio_file_tool(
        audio_file_path: str,
        model_size: str = "turbo",
        language: str = None,
        output_format: str = "srt",
        enable_speaker_diarization: bool = False
    ):
        return await mcp_tools.transcribe_audio_file(
            audio_file_path, model_size, language, output_format, enable_speaker_diarization
        )

    @mcp.tool(description="Download Apple Podcast audio files")
    async def download_apple_podcast_tool(url: str):
        return await mcp_tools.download_apple_podcast(url)

    @mcp.tool(description="Download XiaoYuZhou podcast audio files")
    async def download_xyz_podcast_tool(url: str):
        return await mcp_tools.download_xyz_podcast(url)

    @mcp.tool(description="Scan directory for MP3 audio files")
    async def get_mp3_files_tool(directory: str):
        return await mcp_tools.get_mp3_files(directory)

    @mcp.tool(description="Get basic file information")
    async def get_file_info_tool(file_path: str):
        return await mcp_tools.get_file_info(file_path)

    @mcp.tool(description="Read text file content in segments")
    async def read_text_file_segments_tool(
        file_path: str,
        chunk_size: int = 65536,
        start_position: int = 0
    ):
        return await mcp_tools.read_text_file_segments(file_path, chunk_size, start_position)

    # Create FastAPI wrapper
    fastapi_wrapper = FastAPI(
        title="Modal AudioTranscriber MCP",
        description="Gradio UI + FastMCP Tool + Modal Integration AudioTranscriber MCP",
        version="1.0.0",
        lifespan=lambda app: mcp.session_manager.run()
    )

    # Get FastMCP's streamable HTTP app
    mcp_app = mcp.streamable_http_app()

    # Mount FastMCP application to /api path
    fastapi_wrapper.mount("/api", mcp_app)

    # Create Gradio interface
    ui_app = create_gradio_interface()

    # Use Gradio's standard mounting approach
    final_app = mount_gradio_app(
        app=fastapi_wrapper,
        blocks=ui_app,
        path="/",
        app_kwargs={
            "docs_url": "/docs",
            "redoc_url": "/redoc",
        }
    )

    print("✅ Server startup completed")
    print("🎨 Gradio UI: /")
    print("🔧 MCP Streamable HTTP: /api/mcp")
    print(f"📝 Server name: {mcp.name}")

    return final_app

# ==================== Modal Deployment Configuration ====================

# Create a separate Modal app for the Gradio interface
if _modal_available:
    gradio_mcp_app = modal.App(name="gradio-mcp-ui")

    @gradio_mcp_app.function(
        image=image,
        cpu=2,  # Adequate CPU for UI operations
        memory=4096,  # 4GB memory for stable UI performance
        max_containers=5,  # Reduced to control resource usage
        min_containers=1,  # Keep minimum containers for faster response
        scaledown_window=600,  # 10 minutes before scaling down
        timeout=1800,  # 30 minutes timeout to prevent preemption
        volumes={cache_dir: volume},
        secrets=secrets,
    )
    @modal.concurrent(max_inputs=100)
    @modal.asgi_app()
    def app_entry():
        """Modal deployment function - create and return the complete Gradio + MCP application"""
        return create_app()

# ==================== Main Entry Point ====================

def main():
    """Main entry point for all deployment modes"""

    if is_modal_mode():
        print("☁️ Modal mode: Use 'modal deploy src.app::gradio_mcp_app'")
        return None
    else:
        print("🏠 Starting in local mode")
        print("💡 GPU functions will be routed to Modal endpoints")

        app = create_app()
        return app

def run_local():
    """Run local server with uvicorn (for direct execution)"""
    app = main()
    if app:
        uvicorn.run(
            app,
            host="0.0.0.0",
            port=8000,
            reload=False
        )

# ==================== Hugging Face Spaces Support ====================

# For Hugging Face Spaces, directly create the app
def get_app():
    """Get app instance for HF Spaces"""
    if "DEPLOYMENT_MODE" not in os.environ:
        os.environ["DEPLOYMENT_MODE"] = "local"
    return main()

# Create app for HF Spaces when imported
if __name__ != "__main__":
    app = get_app()

if __name__ == "__main__":
    run_local()
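A minimal launch sketch mirroring run_local() above (not part of the diff; it assumes the package is importable as src.app, and sets DEPLOYMENT_MODE before import so GPU work stays routed to Modal endpoints):

# Hypothetical local launch; equivalent to calling run_local() directly.
import os
os.environ["DEPLOYMENT_MODE"] = "local"  # must be set before importing src.app

import uvicorn
from src.app import main

application = main()
if application:
    uvicorn.run(application, host="0.0.0.0", port=8000)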
src/config/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""
Config Module - Configuration management
"""

__all__ = []
src/config/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (220 Bytes).
src/config/__pycache__/config.cpython-310.pyc
ADDED
Binary file (2.78 kB).
src/config/__pycache__/modal_config.cpython-310.pyc
ADDED
Binary file (4.98 kB).
src/config/config.py
ADDED
@@ -0,0 +1,81 @@
"""
Deployment configuration for Gradio + MCP Server
Supports two deployment modes:
1. Local mode: Gradio runs locally, GPU functions call Modal endpoints
2. Modal mode: Gradio runs on Modal, GPU functions run locally on Modal
"""

import os
from enum import Enum
from typing import Optional

class DeploymentMode(Enum):
    LOCAL = "local"  # Local Gradio + Remote GPU (Modal endpoints)
    MODAL = "modal"  # Modal Gradio + Local GPU (Modal functions)

# Get deployment mode from environment variable
DEPLOYMENT_MODE = DeploymentMode(os.getenv("DEPLOYMENT_MODE", "local"))

# Modal endpoints configuration
MODAL_APP_NAME = "gradio-mcp-server"

# Endpoint URLs (will be set when deployed)
ENDPOINTS = {
    "transcribe_audio": None,  # Will be filled with actual endpoint URL
}

def get_deployment_mode() -> DeploymentMode:
    """Get current deployment mode"""
    return DEPLOYMENT_MODE

def is_local_mode() -> bool:
    """Check if running in local mode"""
    return DEPLOYMENT_MODE == DeploymentMode.LOCAL

def is_modal_mode() -> bool:
    """Check if running in modal mode"""
    return DEPLOYMENT_MODE == DeploymentMode.MODAL

def set_endpoint_url(endpoint_name: str, url: str):
    """Set endpoint URL for local mode"""
    global ENDPOINTS
    ENDPOINTS[endpoint_name] = url

def get_endpoint_url(endpoint_name: str) -> Optional[str]:
    """Get endpoint URL for local mode"""
    return ENDPOINTS.get(endpoint_name)

def get_transcribe_endpoint_url() -> Optional[str]:
    """Get transcription endpoint URL"""
    return get_endpoint_url("transcribe_audio")

# Environment-specific cache directory
def get_cache_dir() -> str:
    """Get cache directory based on deployment mode"""
    if is_modal_mode():
        return "/root/cache"
    else:
        # Local mode - use the user's home directory
        home_dir = os.path.expanduser("~")
        cache_dir = os.path.join(home_dir, ".gradio_mcp_cache")
        os.makedirs(cache_dir, exist_ok=True)
        return cache_dir

# Auto-load endpoint configuration in local mode
if is_local_mode():
    import json
    config_file = "endpoint_config.json"
    if os.path.exists(config_file):
        try:
            with open(config_file, 'r') as f:
                config = json.load(f)
            for endpoint_name, url in config.items():
                set_endpoint_url(endpoint_name, url)
            print(f"✅ Loaded endpoint configuration from {config_file}")
        except Exception as e:
            print(f"⚠️ Failed to load endpoint configuration: {e}")
    else:
        print("⚠️ No endpoint configuration found. Run 'python deploy_endpoints.py deploy' first.")

print(f"🚀 Deployment mode: {DEPLOYMENT_MODE.value}")
print(f"📁 Cache directory: {get_cache_dir()}")
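A minimal sketch of wiring an endpoint URL in local mode via the setters above (not part of the diff; the URL is a hypothetical placeholder):

# Hypothetical endpoint registration for local mode; the URL is a placeholder.
from src.config.config import set_endpoint_url, get_transcribe_endpoint_url

set_endpoint_url("transcribe_audio", "https://example--transcribe-audio.modal.run")
assert get_transcribe_endpoint_url() == "https://example--transcribe-audio.modal.run"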
src/config/modal_config.py
ADDED
@@ -0,0 +1,210 @@
import modal
import os

# Create Modal application
app = modal.App(name="gradio-mcp-server")

# Try to get Hugging Face token from Modal secrets (required for speaker diarization)
try:
    hf_secret = modal.Secret.from_name("huggingface-secret")
    print("✅ Found Hugging Face secret configuration")
except Exception:
    hf_secret = None
    print("⚠️ Hugging Face secret not found, speaker diarization will be disabled")

# Create mounted volume
volume = modal.Volume.from_name("cache-volume", create_if_missing=True)
cache_dir = "/root/cache"

# Model preloading function
def download_models() -> None:
    """Download and cache Whisper and speaker diarization models"""
    import whisper
    import os
    from pathlib import Path

    # Create model cache directory
    model_cache_dir = Path("/model")
    model_cache_dir.mkdir(exist_ok=True)

    print("📥 Downloading Whisper turbo model...")
    # Download and cache Whisper turbo model
    whisper_model = whisper.load_model("turbo", download_root="/model")
    print("✅ Whisper turbo model downloaded and cached")

    # Download speaker diarization models if HF token is available
    if os.environ.get("HF_TOKEN"):
        try:
            print("📥 Downloading speaker diarization models...")
            from pyannote.audio import Pipeline, Model
            from pyannote.audio.core.inference import Inference
            import torch

            # Set proper cache directory for pyannote
            os.environ["PYANNOTE_CACHE"] = "/model/speaker-diarization"

            # Download and cache speaker diarization pipeline
            # This will automatically cache to the PYANNOTE_CACHE directory
            pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=os.environ["HF_TOKEN"],
                cache_dir="/model/speaker-diarization"
            )

            # Preload speaker embedding model for speaker identification
            print("📥 Downloading speaker embedding model...")
            embedding_model = Model.from_pretrained(
                "pyannote/embedding",
                use_auth_token=os.environ["HF_TOKEN"],
                cache_dir="/model/speaker-embedding"
            )

            # Set device for models
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            embedding_model.to(device)
            embedding_model.eval()

            # Create inference object for embedding extraction
            inference = Inference(embedding_model, window="whole")

            # Verify the pipeline works
            print("🧪 Testing speaker diarization pipeline...")

            # Create a simple marker file to indicate successful download
            import json
            speaker_dir = Path("/model/speaker-diarization")
            speaker_dir.mkdir(exist_ok=True, parents=True)

            embedding_dir = Path("/model/speaker-embedding")
            embedding_dir.mkdir(exist_ok=True, parents=True)

            config = {
                "model_name": "pyannote/speaker-diarization-3.1",
                "embedding_model_name": "pyannote/embedding",
                "cached_at": str(speaker_dir),
                "embedding_cached_at": str(embedding_dir),
                "cache_complete": True,
                "embedding_cache_complete": True,
                "pyannote_cache_env": "/model/speaker-diarization",
                "device": str(device)
            }
            with open(speaker_dir / "download_complete.json", "w") as f:
                json.dump(config, f)

            print("✅ Speaker diarization and embedding models downloaded and cached")
        except Exception as e:
            print(f"⚠️ Failed to download speaker diarization models: {e}")
    else:
        print("⚠️ No HF_TOKEN found, skipping speaker diarization model download")

# Create image environment with model preloading
image = modal.Image.debian_slim(python_version="3.11").apt_install(
    # Basic tools
    "ffmpeg",
    "wget",
    "curl",
    "unzip",
    "gnupg2",
    "git",  # Required by Whisper
    # Chrome dependencies
    "libglib2.0-0",
    "libnss3",
    "libatk-bridge2.0-0",
    "libdrm2",
    "libxkbcommon0",
    "libxcomposite1",
    "libxdamage1",
    "libxrandr2",
    "libgbm1",
    "libxss1",
    "libasound2"
).run_commands(
    # Download and install Chrome directly (faster method)
    "wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
    "apt-get install -y ./google-chrome-stable_current_amd64.deb || apt-get install -y -f",
    "rm google-chrome-stable_current_amd64.deb"
).pip_install(
    # Web frameworks and basic libraries
    "gradio>=5.31.0",
    "fastapi",
    "pydantic",
    "python-dotenv",
    # MCP related
    "mcp[cli]",
    "fastmcp>=2.7.0",
    "starlette",
    # Network and parsing
    "beautifulsoup4",
    "selenium",
    "requests",
    # Whisper and audio processing related
    "git+https://github.com/openai/whisper.git",
    "ffmpeg-python",
    "torchaudio==2.1.0",
    "numpy<2",
    # Audio processing dependencies
    "librosa",
    "soundfile",
    # Other Whisper ecosystem dependencies
    "dacite",
    "jiwer",
    "pandas",
    "loguru==0.6.0",
    # GraphQL client (if needed)
    "gql[all]~=3.0.0a5",
    # Speaker diarization related dependencies
    "pyannote.audio==3.1.0",
    # System monitoring
    "psutil",
).run_function(
    download_models,
    secrets=[hf_secret] if hf_secret else []
)

# Update file paths to reflect the new structure
image = image.add_local_dir("../src", remote_path="/root/src")
secrets = [hf_secret] if hf_secret else []

# ==================== Modal Endpoints Configuration ====================

@app.function(
    image=image,
    volumes={cache_dir: volume},
    cpu=4,  # Increased CPU for better performance
    memory=8192,  # 8GB memory for stable transcription
    gpu="A10G",
    timeout=1800,  # 30 minutes timeout for speaker diarization support
    scaledown_window=40,  # 40 seconds before scaling down
    secrets=secrets,
)
@modal.fastapi_endpoint(method="POST", label="transcribe-audio-chunk-endpoint")
def transcribe_audio_chunk_endpoint(request_data: dict):
    """FastAPI endpoint for transcribing a single audio chunk (for distributed processing)"""
    import sys
    sys.path.append('/root')

    from src.services.modal_transcription_service import ModalTranscriptionService

    modal_service = ModalTranscriptionService(cache_dir="/root/cache", use_direct_modal_calls=True)
    return modal_service.process_chunk_request(request_data)

@app.function(
    image=image,
    cpu=2,  # Increased CPU for better health check performance
    memory=2048,  # 2GB memory for stability
    timeout=300,  # 5 minutes timeout for health checks
    scaledown_window=600,  # 10 minutes before scaling down
    secrets=secrets,
)
@modal.fastapi_endpoint(method="GET", label="health-check-endpoint")
def health_check_endpoint():
    """Health check endpoint to verify service status"""
    import sys
    sys.path.append('/root')

    from src.services.health_service import HealthService

    health_service = HealthService()
    return health_service.get_health_status()
src/core/__init__.py
ADDED
@@ -0,0 +1,29 @@
"""
Core components for application and audio processing
"""

# Original core components
from .config import AppConfig, app_config, get_deployment_mode, is_local_mode, is_modal_mode
from .exceptions import AppError, ConfigError, ValidationError

# Audio processing core components
from .audio_splitter import FFmpegAudioSplitter
from .whisper_transcriber import WhisperTranscriber
from .speaker_diarization import PyannoteSpeikerDetector

__all__ = [
    # Original core
    "AppConfig",
    "app_config",
    "get_deployment_mode",
    "is_local_mode",
    "is_modal_mode",
    "AppError",
    "ConfigError",
    "ValidationError",

    # Audio processing core
    "FFmpegAudioSplitter",
    "WhisperTranscriber",
    "PyannoteSpeikerDetector"
]
src/core/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (713 Bytes).
src/core/__pycache__/audio_splitter.cpython-310.pyc
ADDED
Binary file (2.32 kB).
src/core/__pycache__/config.cpython-310.pyc
ADDED
Binary file (5.14 kB).
src/core/__pycache__/exceptions.cpython-310.pyc
ADDED
Binary file (1.39 kB).
src/core/__pycache__/speaker_diarization.cpython-310.pyc
ADDED
Binary file (3.78 kB).
src/core/__pycache__/whisper_transcriber.cpython-310.pyc
ADDED
Binary file (3.41 kB).
src/core/audio_splitter.py
ADDED
@@ -0,0 +1,90 @@
"""
Audio splitter implementation using FFmpeg
"""

import re
from typing import Iterator
import ffmpeg

from ..interfaces.audio_splitter import IAudioSplitter, AudioSegment
from ..utils.errors import AudioSplittingError


class FFmpegAudioSplitter(IAudioSplitter):
    """Audio splitter using FFmpeg's silence detection"""

    def split_audio(
        self,
        audio_path: str,
        min_segment_length: float = 30.0,
        min_silence_length: float = 1.0
    ) -> Iterator[AudioSegment]:
        """Split audio by silence detection"""

        try:
            silence_end_re = re.compile(
                r" silence_end: (?P<end>[0-9]+(\.?[0-9]*)) \| silence_duration: (?P<dur>[0-9]+(\.?[0-9]*))"
            )

            # Get audio duration
            duration = self.get_audio_duration(audio_path)

            # Use silence detection filter
            reader = (
                ffmpeg.input(str(audio_path))
                .filter("silencedetect", n="-10dB", d=min_silence_length)
                .output("pipe:", format="null")
                .run_async(pipe_stderr=True)
            )

            cur_start = 0.0
            segment_count = 0

            while True:
                line = reader.stderr.readline().decode("utf-8")
                if not line:
                    break

                match = silence_end_re.search(line)
                if match:
                    silence_end, silence_dur = match.group("end"), match.group("dur")
                    split_at = float(silence_end) - (float(silence_dur) / 2)

                    if (split_at - cur_start) < min_segment_length:
                        continue

                    yield AudioSegment(
                        start=cur_start,
                        end=split_at,
                        duration=split_at - cur_start
                    )
                    cur_start = split_at
                    segment_count += 1

            # Handle the last segment
            if duration > cur_start:
                yield AudioSegment(
                    start=cur_start,
                    end=duration,
                    duration=duration - cur_start
                )
                segment_count += 1

            print(f"Audio split into {segment_count} segments")

        except Exception as e:
            raise AudioSplittingError(
                f"Failed to split audio: {str(e)}",
                audio_file=audio_path
            )

    def get_audio_duration(self, audio_path: str) -> float:
        """Get total duration of audio file"""
        try:
            metadata = ffmpeg.probe(audio_path)
            return float(metadata["format"]["duration"])
        except Exception as e:
            raise AudioSplittingError(
                f"Failed to get audio duration: {str(e)}",
                audio_file=audio_path
            )
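A minimal usage sketch of the silence-based splitter above (not part of the diff; "episode.mp3" is a placeholder and ffmpeg must be on PATH):

# Hypothetical usage of FFmpegAudioSplitter; the input file is a placeholder.
from src.core.audio_splitter import FFmpegAudioSplitter

splitter = FFmpegAudioSplitter()
for segment in splitter.split_audio("episode.mp3", min_segment_length=30.0):
    print(f"{segment.start:.1f}s -> {segment.end:.1f}s ({segment.duration:.1f}s)")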
src/core/config.py
ADDED
@@ -0,0 +1,150 @@
"""
Configuration management for PodcastMCP
"""

import os
import json
from enum import Enum
from typing import Optional, Dict, Any
from pathlib import Path


class DeploymentMode(Enum):
    """Deployment mode enumeration"""
    LOCAL = "local"      # Local Gradio + Modal GPU endpoints
    MODAL = "modal"      # Runs entirely on the Modal platform
    HF_SPACES = "hf"     # Hugging Face Spaces deployment


class AppConfig:
    """Application configuration manager"""

    def __init__(self):
        self._deployment_mode = self._detect_deployment_mode()
        self._cache_dir = self._get_cache_directory()
        self._endpoints = self._load_endpoints()

    @property
    def deployment_mode(self) -> DeploymentMode:
        """Get the current deployment mode"""
        return self._deployment_mode

    @property
    def cache_dir(self) -> str:
        """Get the cache directory"""
        return self._cache_dir

    @property
    def is_local_mode(self) -> bool:
        """Whether running in local mode"""
        return self._deployment_mode == DeploymentMode.LOCAL

    @property
    def is_modal_mode(self) -> bool:
        """Whether running in Modal mode"""
        return self._deployment_mode == DeploymentMode.MODAL

    @property
    def is_hf_spaces_mode(self) -> bool:
        """Whether running in HF Spaces mode"""
        return self._deployment_mode == DeploymentMode.HF_SPACES

    def get_transcribe_endpoint_url(self) -> Optional[str]:
        """Get the transcription endpoint URL"""
        return self._endpoints.get("transcribe_audio")

    def set_endpoint_url(self, service: str, url: str):
        """Set an endpoint URL"""
        self._endpoints[service] = url
        self._save_endpoints()

    def _detect_deployment_mode(self) -> DeploymentMode:
        """Auto-detect the deployment mode"""
        # Check the environment variable
        mode = os.environ.get("DEPLOYMENT_MODE", "").lower()
        if mode == "modal":
            return DeploymentMode.MODAL
        elif mode == "hf":
            return DeploymentMode.HF_SPACES

        # Check whether running in the HF Spaces environment
        if os.environ.get("SPACE_ID") or os.environ.get("SPACES_ZERO_GPU"):
            return DeploymentMode.HF_SPACES

        # Check whether running in the Modal environment
        if os.environ.get("MODAL_TASK_ID") or os.environ.get("MODAL_IS_INSIDE_CONTAINER"):
            return DeploymentMode.MODAL

        # Default to local mode
        return DeploymentMode.LOCAL

    def _get_cache_directory(self) -> str:
        """Get the cache directory path"""
        if self.is_modal_mode:
            return "/root/cache"
        else:
            # Local mode and HF Spaces use the user's cache directory
            home_dir = Path.home()
            cache_dir = home_dir / ".gradio_mcp_cache"
            cache_dir.mkdir(exist_ok=True)
            return str(cache_dir)

    def _load_endpoints(self) -> Dict[str, str]:
        """Load the endpoint configuration"""
        config_file = Path("endpoint_config.json")
        if config_file.exists():
            try:
                with open(config_file, 'r') as f:
                    endpoints = json.load(f)
                print(f"✅ Loaded endpoint configuration from {config_file}")
                return endpoints
            except Exception as e:
                print(f"⚠️ Failed to load endpoint config: {e}")
        else:
            print("⚠️ No endpoint configuration found. Run deployment first.")

        return {}

    def _save_endpoints(self):
        """Save the endpoint configuration"""
        config_file = Path("endpoint_config.json")
        try:
            with open(config_file, 'w') as f:
                json.dump(self._endpoints, f, indent=2)
            print(f"💾 Endpoint configuration saved to {config_file}")
        except Exception as e:
            print(f"⚠️ Failed to save endpoint config: {e}")


# Global configuration instance
app_config = AppConfig()

# Backward-compatible function interfaces
def get_deployment_mode() -> str:
    """Get the deployment mode as a string"""
    return app_config.deployment_mode.value

def is_local_mode() -> bool:
    """Whether running in local mode"""
    return app_config.is_local_mode

def is_modal_mode() -> bool:
    """Whether running in Modal mode"""
    return app_config.is_modal_mode

def get_cache_dir() -> str:
    """Get the cache directory"""
    return app_config.cache_dir

def get_transcribe_endpoint_url() -> Optional[str]:
    """Get the transcription endpoint URL"""
    return app_config.get_transcribe_endpoint_url()

def set_endpoint_url(service: str, url: str):
    """Set an endpoint URL"""
    app_config.set_endpoint_url(service, url)


# Print configuration info
print(f"🚀 Deployment mode: {app_config.deployment_mode.value}")
print(f"📁 Cache directory: {app_config.cache_dir}")
src/core/exceptions.py
ADDED
@@ -0,0 +1,43 @@
"""
Custom exceptions for PodcastMCP
"""


class PodcastMCPError(Exception):
    """Base exception class for PodcastMCP"""
    pass


class AppError(PodcastMCPError):
    """Application error"""
    pass


class ConfigError(PodcastMCPError):
    """Configuration-related error"""
    pass


class ValidationError(PodcastMCPError):
    """Validation-related error"""
    pass


class TranscriptionError(PodcastMCPError):
    """Transcription-related error"""
    pass


class DeploymentError(PodcastMCPError):
    """Deployment-related error"""
    pass


class FileNotFoundError(PodcastMCPError):
    """File-not-found error (note: shadows the built-in FileNotFoundError)"""
    pass


class EndpointError(PodcastMCPError):
    """Endpoint-related error"""
    pass
src/core/speaker_diarization.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+"""
+Speaker diarization implementation using pyannote.audio
+"""
+
+import os
+import torch
+from typing import Optional, List, Dict, Any
+
+from ..interfaces.speaker_detector import ISpeakerDetector
+from ..utils.config import AudioProcessingConfig
+from ..utils.errors import SpeakerDiarizationError, ModelLoadError
+
+
+class PyannoteSpeakerDetector(ISpeakerDetector):
+    """Speaker diarization using pyannote.audio"""
+
+    def __init__(self, config: Optional[AudioProcessingConfig] = None):
+        self.config = config or AudioProcessingConfig()
+        self.device = self._setup_device()
+        self.pipeline = None
+        self.auth_token = os.environ.get(self.config.hf_token_env_var)
+
+        if not self.auth_token:
+            print("⚠️ No Hugging Face token found. Speaker diarization will be disabled.")
+
+    def _setup_device(self) -> torch.device:
+        """Setup and return the best available device"""
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        else:
+            return torch.device("cpu")
+
+    async def detect_speakers(
+        self,
+        audio_file_path: str,
+        num_speakers: Optional[int] = None,
+        min_speakers: int = 1,
+        max_speakers: int = 10
+    ) -> Dict[str, Any]:
+        """Detect speakers in audio file"""
+
+        if not self.auth_token:
+            raise SpeakerDiarizationError(
+                "Speaker diarization requires Hugging Face token",
+                audio_file=audio_file_path
+            )
+
+        try:
+            # Load pipeline if not already loaded
+            if self.pipeline is None:
+                self.pipeline = self._load_pipeline()
+
+            # Perform diarization
+            # (num_speakers / min_speakers / max_speakers are accepted by this
+            # method but not forwarded to the pipeline call below)
+            diarization = self.pipeline(audio_file_path)
+
+            # Convert to our format
+            speakers = {}
+            segments = []
+
+            for turn, _, speaker in diarization.itertracks(yield_label=True):
+                speaker_id = f"SPEAKER_{speaker.split('_')[-1].zfill(2)}"
+                segments.append({
+                    "start": turn.start,
+                    "end": turn.end,
+                    "speaker": speaker_id
+                })
+
+                if speaker_id not in speakers:
+                    speakers[speaker_id] = {
+                        "id": speaker_id,
+                        "total_time": 0.0,
+                        "segments": []
+                    }
+
+                speakers[speaker_id]["total_time"] += turn.end - turn.start
+                speakers[speaker_id]["segments"].append({
+                    "start": turn.start,
+                    "end": turn.end
+                })
+
+            return {
+                "speaker_count": len(speakers),
+                "speakers": speakers,
+                "segments": segments,
+                "audio_file": audio_file_path
+            }
+
+        except Exception as e:
+            raise SpeakerDiarizationError(
+                f"Speaker detection failed: {str(e)}",
+                audio_file=audio_file_path
+            )
+
+    def _load_pipeline(self):
+        """Load pyannote speaker diarization pipeline"""
+        try:
+            # Suppress warnings
+            import warnings
+            warnings.filterwarnings("ignore", category=UserWarning, module="pyannote")
+            warnings.filterwarnings("ignore", category=UserWarning, module="pytorch_lightning")
+            warnings.filterwarnings("ignore", category=FutureWarning, module="pytorch_lightning")
+
+            from pyannote.audio import Pipeline
+
+            print("📥 Loading speaker diarization pipeline...")
+            pipeline = Pipeline.from_pretrained(
+                self.config.speaker_diarization_model,
+                use_auth_token=self.auth_token
+            )
+            pipeline.to(self.device)
+
+            return pipeline
+
+        except Exception as e:
+            raise ModelLoadError(
+                f"Failed to load speaker diarization pipeline: {str(e)}",
+                model_name=self.config.speaker_diarization_model
+            )
+
+    def get_supported_models(self) -> List[str]:
+        """Get list of supported speaker diarization models"""
+        return [self.config.speaker_diarization_model]
+
+    def is_available(self) -> bool:
+        """Check if speaker diarization is available"""
+        return self.auth_token is not None
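
A usage sketch for the detector above, assuming a local `episode.wav` and that the environment variable named by `AudioProcessingConfig.hf_token_env_var` holds a valid Hugging Face token:

import asyncio

from src.core.speaker_diarization import PyannoteSpeakerDetector

async def main():
    detector = PyannoteSpeakerDetector()
    if not detector.is_available():
        print("Set the Hugging Face token env var to enable diarization")
        return
    result = await detector.detect_speakers("episode.wav")
    print(f"Found {result['speaker_count']} speakers")
    for seg in result["segments"][:5]:
        print(f"{seg['speaker']}: {seg['start']:.1f}s - {seg['end']:.1f}s")

asyncio.run(main())
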
src/core/whisper_transcriber.py
ADDED
@@ -0,0 +1,113 @@
+"""
+Local Whisper transcriber implementation
+"""
+
+import whisper
+import torch
+import pathlib
+import time
+from typing import Optional, List
+
+from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
+from ..utils.config import AudioProcessingConfig
+from ..utils.errors import TranscriptionError, ModelLoadError
+
+
+class WhisperTranscriber(ITranscriber):
+    """Local Whisper transcriber implementation"""
+
+    def __init__(self, config: Optional[AudioProcessingConfig] = None):
+        self.config = config or AudioProcessingConfig()
+        self.model_cache = {}
+        self.device = self._setup_device()
+
+    def _setup_device(self) -> str:
+        """Setup and return the best available device"""
+        if torch.cuda.is_available():
+            return "cuda"
+        else:
+            return "cpu"
+
+    async def transcribe(
+        self,
+        audio_file_path: str,
+        model_size: str = "turbo",
+        language: Optional[str] = None,
+        enable_speaker_diarization: bool = False
+    ) -> TranscriptionResult:
+        """Transcribe audio using local Whisper model"""
+
+        try:
+            # Validate audio file
+            audio_path = pathlib.Path(audio_file_path)
+            if not audio_path.exists():
+                raise TranscriptionError(
+                    f"Audio file not found: {audio_file_path}",
+                    audio_file=audio_file_path
+                )
+
+            # Load model
+            model = self._load_model(model_size)
+
+            # Transcribe
+            start_time = time.time()
+            result = model.transcribe(
+                str(audio_path),
+                language=language,
+                verbose=False
+            )
+            processing_time = time.time() - start_time
+
+            # Convert to our format
+            segments = []
+            for seg in result.get("segments", []):
+                segments.append(TranscriptionSegment(
+                    start=seg["start"],
+                    end=seg["end"],
+                    text=seg["text"].strip(),
+                    confidence=seg.get("avg_logprob")
+                ))
+
+            return TranscriptionResult(
+                text=result.get("text", "").strip(),
+                segments=segments,
+                language=result.get("language", "unknown"),
+                model_used=model_size,
+                # Whisper's result dict has no "duration" key, so fall back to
+                # the last segment's end time instead of always reporting 0
+                audio_duration=result.get("duration", segments[-1].end if segments else 0),
+                processing_time=processing_time,
+                speaker_diarization_enabled=enable_speaker_diarization,
+                global_speaker_count=0,
+                error_message=None
+            )
+
+        except Exception as e:
+            raise TranscriptionError(
+                f"Whisper transcription failed: {str(e)}",
+                model=model_size,
+                audio_file=audio_file_path
+            )
+
+    def _load_model(self, model_size: str):
+        """Load Whisper model with caching"""
+        if model_size not in self.model_cache:
+            try:
+                print(f"📥 Loading Whisper model: {model_size}")
+                self.model_cache[model_size] = whisper.load_model(
+                    model_size,
+                    device=self.device
+                )
+            except Exception as e:
+                raise ModelLoadError(
+                    f"Failed to load model {model_size}: {str(e)}",
+                    model_name=model_size
+                )
+
+        return self.model_cache[model_size]
+
+    def get_supported_models(self) -> List[str]:
+        """Get list of supported model sizes"""
+        return list(self.config.whisper_models.keys())
+
+    def get_supported_languages(self) -> List[str]:
+        """Get list of supported language codes"""
+        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]
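
A matching usage sketch; the file name is a placeholder, and the `TranscriptionResult` fields follow the constructor call above:

import asyncio

from src.core.whisper_transcriber import WhisperTranscriber

async def main():
    transcriber = WhisperTranscriber()
    result = await transcriber.transcribe("episode.mp3", model_size="turbo")
    print(f"[{result.language}] {len(result.segments)} segments, "
          f"processed in {result.processing_time:.1f}s")
    print(result.text[:200])

asyncio.run(main())
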
src/deployment/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+Deployment management for audio processing services
+"""
+
+from .modal_deployer import ModalDeployer
+from .endpoint_manager import EndpointManager
+
+__all__ = ["ModalDeployer", "EndpointManager"]
src/deployment/deployment_manager.py
ADDED
@@ -0,0 +1,153 @@
+"""
+Simplified deployment manager.
+This replaces the complex deploy_endpoints.py with a cleaner interface.
+"""
+
+import argparse
+import sys
+from typing import Optional
+
+from .modal_deployer import ModalDeployer
+from .endpoint_manager import EndpointManager
+from ..utils.config import AudioProcessingConfig
+from ..utils.errors import DeploymentError
+
+
+class DeploymentManager:
+    """Simplified deployment manager for audio processing services"""
+
+    def __init__(self):
+        self.config = AudioProcessingConfig()
+        self.modal_deployer = ModalDeployer(self.config)
+        self.endpoint_manager = EndpointManager()
+
+    def deploy(self) -> bool:
+        """Deploy transcription service"""
+        try:
+            print("🚀 Starting deployment process...")
+            endpoint_url = self.modal_deployer.deploy_transcription_service()
+
+            if endpoint_url:
+                print("✅ Deployment successful!")
+                print(f"🌐 Endpoint URL: {endpoint_url}")
+                return True
+            else:
+                print("❌ Deployment failed: Could not get endpoint URL")
+                return False
+
+        except DeploymentError as e:
+            print(f"❌ Deployment failed: {e.message}")
+            if e.details:
+                print(f"📋 Details: {e.details}")
+            return False
+        except Exception as e:
+            print(f"❌ Unexpected deployment error: {str(e)}")
+            return False
+
+    def status(self) -> bool:
+        """Check deployment status"""
+        print("🔍 Checking deployment status...")
+
+        endpoints = self.endpoint_manager.list_endpoints()
+        if not endpoints:
+            print("❌ No endpoints configured")
+            return False
+
+        print("📋 Configured endpoints:")
+        for name, url in endpoints.items():
+            print(f"  • {name}: {url}")
+
+        # Check health
+        return self.modal_deployer.check_deployment_status()
+
+    def undeploy(self):
+        """Remove deployment configuration"""
+        print("🗑️ Removing deployment configuration...")
+        self.modal_deployer.undeploy_transcription_service()
+
+    def list_endpoints(self):
+        """List all configured endpoints"""
+        endpoints = self.endpoint_manager.list_endpoints()
+
+        if not endpoints:
+            print("📋 No endpoints configured")
+            return
+
+        print("📋 Configured endpoints:")
+        for name, url in endpoints.items():
+            health_status = "✅ Healthy" if self.endpoint_manager.check_endpoint_health(name) else "❌ Unhealthy"
+            print(f"  • {name}: {url} ({health_status})")
+
+    def set_endpoint(self, name: str, url: str):
+        """Manually set an endpoint"""
+        self.endpoint_manager.set_endpoint(name, url)
+
+    def remove_endpoint(self, name: str):
+        """Remove an endpoint"""
+        self.endpoint_manager.remove_endpoint(name)
+
+
+def main():
+    """Command line interface for deployment manager"""
+    parser = argparse.ArgumentParser(description="Audio Processing Deployment Manager")
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+    # Deploy command
+    subparsers.add_parser("deploy", help="Deploy transcription service to Modal")
+
+    # Status command
+    subparsers.add_parser("status", help="Check deployment status")
+
+    # Undeploy command
+    subparsers.add_parser("undeploy", help="Remove deployment configuration")
+
+    # List endpoints command
+    subparsers.add_parser("list", help="List all configured endpoints")
+
+    # Set endpoint command
+    set_parser = subparsers.add_parser("set", help="Set endpoint URL manually")
+    set_parser.add_argument("name", help="Endpoint name")
+    set_parser.add_argument("url", help="Endpoint URL")
+
+    # Remove endpoint command
+    remove_parser = subparsers.add_parser("remove", help="Remove endpoint")
+    remove_parser.add_argument("name", help="Endpoint name")
+
+    args = parser.parse_args()
+
+    if not args.command:
+        parser.print_help()
+        return
+
+    manager = DeploymentManager()
+
+    try:
+        if args.command == "deploy":
+            success = manager.deploy()
+            sys.exit(0 if success else 1)
+
+        elif args.command == "status":
+            success = manager.status()
+            sys.exit(0 if success else 1)
+
+        elif args.command == "undeploy":
+            manager.undeploy()
+
+        elif args.command == "list":
+            manager.list_endpoints()
+
+        elif args.command == "set":
+            manager.set_endpoint(args.name, args.url)
+
+        elif args.command == "remove":
+            manager.remove_endpoint(args.name)
+
+    except KeyboardInterrupt:
+        print("\n⚠️ Operation cancelled by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
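
Assuming the package is importable as `src.deployment` (an inference from the file layout in this upload), the argparse CLI wired up in `main()` can be driven like this:

python -m src.deployment.deployment_manager deploy
python -m src.deployment.deployment_manager status
python -m src.deployment.deployment_manager set transcribe_audio https://my-app--transcribe-audio-endpoint.modal.run
python -m src.deployment.deployment_manager remove transcribe_audio
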
src/deployment/endpoint_manager.py
ADDED
@@ -0,0 +1,76 @@
+"""
+Endpoint manager for handling Modal endpoints
+"""
+
+import json
+import os
+from typing import Dict, Optional
+
+from ..utils.errors import ConfigurationError
+
+
+class EndpointManager:
+    """Manager for Modal endpoint configuration"""
+
+    def __init__(self, config_file: str = "endpoint_config.json"):
+        self.config_file = config_file
+        self._endpoints = self._load_endpoints()
+
+    def _load_endpoints(self) -> Dict[str, str]:
+        """Load endpoints from configuration file"""
+        if not os.path.exists(self.config_file):
+            return {}
+
+        try:
+            with open(self.config_file, 'r') as f:
+                return json.load(f)
+        except Exception as e:
+            print(f"⚠️ Failed to load endpoint configuration: {e}")
+            return {}
+
+    def save_endpoints(self):
+        """Save endpoints to configuration file"""
+        try:
+            with open(self.config_file, 'w') as f:
+                json.dump(self._endpoints, f, indent=2)
+            print(f"💾 Endpoint configuration saved to {self.config_file}")
+        except Exception as e:
+            raise ConfigurationError(f"Failed to save endpoint configuration: {e}")
+
+    def set_endpoint(self, name: str, url: str):
+        """Set endpoint URL"""
+        self._endpoints[name] = url
+        self.save_endpoints()
+        print(f"✅ Endpoint '{name}' set to: {url}")
+
+    def get_endpoint(self, name: str) -> Optional[str]:
+        """Get endpoint URL"""
+        return self._endpoints.get(name)
+
+    def remove_endpoint(self, name: str):
+        """Remove endpoint"""
+        if name in self._endpoints:
+            del self._endpoints[name]
+            self.save_endpoints()
+            print(f"🗑️ Endpoint '{name}' removed")
+        else:
+            print(f"⚠️ Endpoint '{name}' not found")
+
+    def list_endpoints(self) -> Dict[str, str]:
+        """List all endpoints"""
+        return self._endpoints.copy()
+
+    def check_endpoint_health(self, name: str) -> bool:
+        """Check if endpoint is healthy"""
+        url = self.get_endpoint(name)
+        if not url:
+            return False
+
+        try:
+            import requests
+            # Try a simple health check (adjust based on your endpoint)
+            health_url = url.replace("/transcribe", "/health")
+            response = requests.get(health_url, timeout=10)
+            return response.status_code == 200
+        except Exception:
+            return False
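
A usage sketch (the URL is a made-up example; note that `check_endpoint_health` derives the probe URL by rewriting `/transcribe` to `/health`):

from src.deployment.endpoint_manager import EndpointManager

manager = EndpointManager("endpoint_config.json")
manager.set_endpoint(
    "transcribe_audio",
    "https://my-app--transcribe-audio-endpoint.modal.run/transcribe"
)
print(manager.list_endpoints())
print("healthy:", manager.check_endpoint_health("transcribe_audio"))
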
src/deployment/modal_deployer.py
ADDED
@@ -0,0 +1,97 @@
+"""
+Modal deployer for deploying transcription services
+"""
+
+import subprocess
+from typing import Optional
+
+from ..utils.config import AudioProcessingConfig
+from ..utils.errors import DeploymentError
+from .endpoint_manager import EndpointManager
+
+
+class ModalDeployer:
+    """Deployer for Modal transcription services"""
+
+    def __init__(self, config: Optional[AudioProcessingConfig] = None):
+        self.config = config or AudioProcessingConfig()
+        self.endpoint_manager = EndpointManager()
+
+    def deploy_transcription_service(self) -> Optional[str]:
+        """Deploy transcription service to Modal"""
+        print("🚀 Deploying transcription service to Modal...")
+
+        try:
+            # Deploy the Modal app
+            print("🚀 Running modal deploy command...")
+            result = subprocess.run(
+                ["modal", "deploy", "modal_config.py"],
+                capture_output=True,
+                text=True
+            )
+
+            if result.returncode == 0:
+                # Extract or construct endpoint URL
+                endpoint_url = self._extract_endpoint_url(result.stdout)
+
+                if endpoint_url:
+                    # Save endpoint configuration
+                    self.endpoint_manager.set_endpoint("transcribe_audio", endpoint_url)
+                    print(f"✅ Transcription service deployed: {endpoint_url}")
+                    return endpoint_url
+                else:
+                    print("⚠️ Could not extract endpoint URL from deployment output")
+                    return None
+            else:
+                raise DeploymentError(
+                    f"Modal deployment failed: {result.stderr}",
+                    service="transcription"
+                )
+
+        except FileNotFoundError:
+            # Built-in FileNotFoundError: the "modal" executable is missing
+            raise DeploymentError(
+                "Modal CLI not found. Please install Modal: pip install modal",
+                service="transcription"
+            )
+        except Exception as e:
+            raise DeploymentError(
+                f"Failed to deploy transcription service: {str(e)}",
+                service="transcription"
+            )
+
+    def _extract_endpoint_url(self, output: str) -> Optional[str]:
+        """Extract endpoint URL from deployment output"""
+        # Look for URL in output
+        for line in output.split('\n'):
+            if 'https://' in line and 'modal.run' in line:
+                # Extract URL from line
+                parts = line.split()
+                for part in parts:
+                    if part.startswith('https://') and 'modal.run' in part:
+                        return part
+
+        # Fallback to constructed URL
+        return f"https://{self.config.modal_app_name}--transcribe-audio-endpoint.modal.run"
+
+    def check_deployment_status(self) -> bool:
+        """Check if transcription service is deployed and healthy"""
+        endpoint_url = self.endpoint_manager.get_endpoint("transcribe_audio")
+        if not endpoint_url:
+            print("❌ No transcription endpoint configured")
+            return False
+
+        if self.endpoint_manager.check_endpoint_health("transcribe_audio"):
+            print(f"✅ Transcription service is healthy: {endpoint_url}")
+            return True
+        else:
+            print(f"❌ Transcription service is not responding: {endpoint_url}")
+            return False
+
+    def undeploy_transcription_service(self):
+        """Remove transcription service endpoint"""
+        self.endpoint_manager.remove_endpoint("transcribe_audio")
+        print("🗑️ Transcription service endpoint removed from configuration")
+        print("💡 Note: The actual Modal deployment may still be active. Use 'modal app stop' to stop it.")
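
The health check above only succeeds if the deployed app serves a `/health` route alongside `/transcribe`. A minimal FastAPI handler of that shape might look like the sketch below; this is written under that assumption and is not code from `modal_config.py`:

from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
def health() -> dict:
    # Liveness probe matching the "/transcribe" -> "/health" rewrite
    # in EndpointManager.check_endpoint_health
    return {"status": "ok"}
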
src/interfaces/__init__.py
ADDED
@@ -0,0 +1,38 @@
+"""
+Interfaces for audio processing components
+"""
+
+from .transcriber import ITranscriber
+from .speaker_detector import ISpeakerDetector
+from .audio_splitter import IAudioSplitter
+from .audio_processor import IAudioProcessor, AudioSegment
+from .podcast_downloader import IPodcastDownloader, PodcastInfo, DownloadResult, PodcastPlatform
+from .speaker_manager import (
+    ISpeakerEmbeddingManager,
+    ISpeakerIdentificationService,
+    SpeakerEmbedding,
+    SpeakerSegment
+)
+
+__all__ = [
+    # Core interfaces
+    "ITranscriber",
+    "ISpeakerDetector",
+    "IAudioSplitter",
+
+    # New service interfaces
+    "IAudioProcessor",
+    "IPodcastDownloader",
+    "ISpeakerEmbeddingManager",
+    "ISpeakerIdentificationService",
+
+    # Data classes
+    "AudioSegment",
+    "PodcastInfo",
+    "DownloadResult",
+    "SpeakerEmbedding",
+    "SpeakerSegment",
+
+    # Enums
+    "PodcastPlatform"
+]
src/interfaces/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (828 Bytes)

src/interfaces/__pycache__/audio_processor.cpython-310.pyc
ADDED
Binary file (2.01 kB)

src/interfaces/__pycache__/audio_splitter.cpython-310.pyc
ADDED
Binary file (1.84 kB)

src/interfaces/__pycache__/podcast_downloader.cpython-310.pyc
ADDED
Binary file (2.63 kB)

src/interfaces/__pycache__/speaker_detector.cpython-310.pyc
ADDED
Binary file (2.55 kB)

src/interfaces/__pycache__/speaker_manager.cpython-310.pyc
ADDED
Binary file (4.26 kB)

src/interfaces/__pycache__/transcriber.cpython-310.pyc
ADDED
Binary file (2.55 kB)
src/interfaces/audio_processor.py
ADDED
@@ -0,0 +1,53 @@
+"""
+Audio processing interface definitions
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, List, Tuple, Iterator, Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class AudioSegment:
+    """Audio segment representation"""
+    start: float
+    end: float
+    file_path: str
+    duration: float
+
+
+class IAudioProcessor(ABC):
+    """Interface for audio processing operations"""
+
+    @abstractmethod
+    async def split_audio_by_silence(
+        self,
+        audio_path: str,
+        min_segment_length: float = 30.0,
+        min_silence_length: float = 1.0
+    ) -> List[AudioSegment]:
+        """Split audio file by silence detection"""
+        pass
+
+    @abstractmethod
+    async def process_audio_segment(
+        self,
+        segment: AudioSegment,
+        model_name: str = "turbo",
+        language: Optional[str] = None,
+        enable_speaker_diarization: bool = False
+    ) -> Dict[str, Any]:
+        """Process a single audio segment"""
+        pass
+
+    @abstractmethod
+    async def process_complete_audio(
+        self,
+        audio_path: str,
+        model_name: str = "turbo",
+        language: Optional[str] = None,
+        enable_speaker_diarization: bool = False,
+        min_segment_length: float = 30.0
+    ) -> Dict[str, Any]:
+        """Process complete audio file"""
+        pass
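
A toy implementation sketch of this interface, cutting fixed-length windows instead of doing real silence detection; the class name and the 90-second stand-in duration are illustrative, not part of this upload:

from typing import Any, Dict, List, Optional

from src.interfaces.audio_processor import AudioSegment, IAudioProcessor

class FixedWindowProcessor(IAudioProcessor):
    """Cuts fixed-length windows; a real implementation would detect silence."""

    async def split_audio_by_silence(
        self, audio_path: str,
        min_segment_length: float = 30.0,
        min_silence_length: float = 1.0
    ) -> List[AudioSegment]:
        duration = 90.0  # stand-in; a real splitter would probe the file
        segments, start = [], 0.0
        while start < duration:
            end = min(start + min_segment_length, duration)
            segments.append(AudioSegment(start, end, audio_path, end - start))
            start = end
        return segments

    async def process_audio_segment(
        self, segment: AudioSegment, model_name: str = "turbo",
        language: Optional[str] = None, enable_speaker_diarization: bool = False
    ) -> Dict[str, Any]:
        # Placeholder result; a real implementation would transcribe the segment
        return {"start": segment.start, "end": segment.end, "text": ""}

    async def process_complete_audio(
        self, audio_path: str, model_name: str = "turbo",
        language: Optional[str] = None, enable_speaker_diarization: bool = False,
        min_segment_length: float = 30.0
    ) -> Dict[str, Any]:
        segments = await self.split_audio_by_silence(audio_path, min_segment_length)
        results = [await self.process_audio_segment(s, model_name, language,
                                                    enable_speaker_diarization)
                   for s in segments]
        return {"audio_file": audio_path, "segments": results}
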
src/interfaces/audio_splitter.py
ADDED
@@ -0,0 +1,48 @@
+"""
+Audio splitter interface definition
+"""
+
+from abc import ABC, abstractmethod
+from typing import Iterator, Tuple
+from dataclasses import dataclass
+
+
+@dataclass
+class AudioSegment:
+    """Audio segment data class"""
+    start: float
+    end: float
+    duration: float
+
+    def __post_init__(self):
+        if self.duration <= 0:
+            self.duration = self.end - self.start
+
+
+class IAudioSplitter(ABC):
+    """Interface for audio splitting"""
+
+    @abstractmethod
+    def split_audio(
+        self,
+        audio_path: str,
+        min_segment_length: float = 30.0,
+        min_silence_length: float = 1.0
+    ) -> Iterator[AudioSegment]:
+        """
+        Split audio into segments
+
+        Args:
+            audio_path: Path to audio file
+            min_segment_length: Minimum segment length in seconds
+            min_silence_length: Minimum silence length for splitting
+
+        Yields:
+            AudioSegment objects
+        """
+        pass
+
+    @abstractmethod
+    def get_audio_duration(self, audio_path: str) -> float:
+        """Get total duration of audio file"""
+        pass
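
The `__post_init__` hook means callers can pass a non-positive duration and have it derived from the boundaries; a quick illustration:

from src.interfaces.audio_splitter import AudioSegment

seg = AudioSegment(start=12.5, end=47.0, duration=0)
print(seg.duration)  # 34.5, filled in by __post_init__ because duration <= 0
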
src/interfaces/podcast_downloader.py
ADDED
@@ -0,0 +1,66 @@
+"""
+Podcast downloading interface definitions
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional, Tuple
+from dataclasses import dataclass
+from enum import Enum
+
+
+class PodcastPlatform(Enum):
+    """Podcast platform enumeration"""
+    APPLE = "apple"
+    XIAOYUZHOU = "xyz"
+    SPOTIFY = "spotify"
+    GENERIC = "generic"
+
+
+@dataclass
+class PodcastInfo:
+    """Podcast episode information"""
+    title: str
+    audio_url: str
+    episode_id: str
+    platform: PodcastPlatform
+    duration: Optional[float] = None
+    description: Optional[str] = None
+
+
+@dataclass
+class DownloadResult:
+    """Download operation result"""
+    success: bool
+    file_path: Optional[str]
+    podcast_info: Optional[PodcastInfo]
+    error_message: Optional[str] = None
+
+
+class IPodcastDownloader(ABC):
+    """Interface for podcast downloading operations"""
+
+    @abstractmethod
+    async def extract_podcast_info(self, url: str) -> PodcastInfo:
+        """Extract podcast information from URL"""
+        pass
+
+    @abstractmethod
+    async def download_podcast(
+        self,
+        url: str,
+        output_folder: str = "downloads",
+        convert_to_mp3: bool = False,
+        keep_original: bool = False
+    ) -> DownloadResult:
+        """Download podcast from URL"""
+        pass
+
+    @abstractmethod
+    def get_supported_platforms(self) -> list[PodcastPlatform]:
+        """Get list of supported platforms"""
+        pass
+
+    @abstractmethod
+    def can_handle_url(self, url: str) -> bool:
+        """Check if this downloader can handle the given URL"""
+        pass
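
`can_handle_url` enables a simple dispatch pattern over multiple downloader implementations; a sketch (the helper name is ours, not part of this upload):

from typing import List

from src.interfaces.podcast_downloader import IPodcastDownloader

def pick_downloader(url: str, downloaders: List[IPodcastDownloader]) -> IPodcastDownloader:
    # Return the first downloader that claims it can handle the URL
    for downloader in downloaders:
        if downloader.can_handle_url(url):
            return downloader
    raise ValueError(f"No downloader registered for: {url}")
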
src/interfaces/speaker_detector.py
ADDED
@@ -0,0 +1,71 @@
+"""
+Speaker detector interface definition
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional
+from dataclasses import dataclass
+import numpy as np
+
+
+@dataclass
+class SpeakerSegment:
+    """Speaker segment data class"""
+    start: float
+    end: float
+    speaker_id: str
+    confidence: Optional[float] = None
+
+
+@dataclass
+class SpeakerProfile:
+    """Speaker profile data class"""
+    speaker_id: str
+    embedding: np.ndarray
+    segments: List[SpeakerSegment]
+    total_duration: float
+
+
+class ISpeakerDetector(ABC):
+    """Interface for speaker detection and diarization"""
+
+    @abstractmethod
+    async def detect_speakers(
+        self,
+        audio_file_path: str,
+        audio_segments: Optional[List] = None
+    ) -> Dict[str, SpeakerProfile]:
+        """
+        Detect and identify speakers in audio
+
+        Args:
+            audio_file_path: Path to audio file
+            audio_segments: Optional pre-segmented audio
+
+        Returns:
+            Dictionary mapping speaker IDs to SpeakerProfile objects
+        """
+        pass
+
+    @abstractmethod
+    def map_to_global_speakers(
+        self,
+        local_speakers: Dict[str, SpeakerProfile],
+        source_file: str
+    ) -> Dict[str, str]:
+        """
+        Map local speakers to global speaker identities
+
+        Args:
+            local_speakers: Local speaker profiles
+            source_file: Source audio file path
+
+        Returns:
+            Mapping from local speaker ID to global speaker ID
+        """
+        pass
+
+    @abstractmethod
+    def get_speaker_summary(self) -> Dict:
+        """Get summary of all detected speakers"""
+        pass
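
A consumption sketch for this interface, assuming some concrete `detector` implementing `ISpeakerDetector` as declared above; `episode.wav` is a placeholder:

import asyncio

async def summarize_speakers(detector, audio_path: str) -> None:
    profiles = await detector.detect_speakers(audio_path)
    mapping = detector.map_to_global_speakers(profiles, source_file=audio_path)
    for local_id, profile in profiles.items():
        print(f"{local_id} -> {mapping.get(local_id)}: "
              f"{profile.total_duration:.1f}s across {len(profile.segments)} segments")

# asyncio.run(summarize_speakers(my_detector, "episode.wav"))
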