richard-su committed
Commit b5df735 · verified · 1 Parent(s): c5a4957

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. src/__init__.py +13 -0
  2. src/__pycache__/__init__.cpython-310.pyc +0 -0
  3. src/__pycache__/app.cpython-310.pyc +0 -0
  4. src/adapters/__init__.py +13 -0
  5. src/adapters/__pycache__/__init__.cpython-310.pyc +0 -0
  6. src/adapters/__pycache__/local_adapter.cpython-310.pyc +0 -0
  7. src/adapters/__pycache__/modal_adapter.cpython-310.pyc +0 -0
  8. src/adapters/__pycache__/transcription_adapter_factory.cpython-310.pyc +0 -0
  9. src/adapters/local_adapter.py +93 -0
  10. src/adapters/modal_adapter.py +126 -0
  11. src/adapters/transcription_adapter_factory.py +77 -0
  12. src/api/__init__.py +5 -0
  13. src/api/__pycache__/__init__.cpython-310.pyc +0 -0
  14. src/api/__pycache__/transcription_api.cpython-310.pyc +0 -0
  15. src/api/transcription_api.py +112 -0
  16. src/app.py +169 -0
  17. src/config/__init__.py +5 -0
  18. src/config/__pycache__/__init__.cpython-310.pyc +0 -0
  19. src/config/__pycache__/config.cpython-310.pyc +0 -0
  20. src/config/__pycache__/modal_config.cpython-310.pyc +0 -0
  21. src/config/config.py +81 -0
  22. src/config/modal_config.py +210 -0
  23. src/core/__init__.py +29 -0
  24. src/core/__pycache__/__init__.cpython-310.pyc +0 -0
  25. src/core/__pycache__/audio_splitter.cpython-310.pyc +0 -0
  26. src/core/__pycache__/config.cpython-310.pyc +0 -0
  27. src/core/__pycache__/exceptions.cpython-310.pyc +0 -0
  28. src/core/__pycache__/speaker_diarization.cpython-310.pyc +0 -0
  29. src/core/__pycache__/whisper_transcriber.cpython-310.pyc +0 -0
  30. src/core/audio_splitter.py +90 -0
  31. src/core/config.py +150 -0
  32. src/core/exceptions.py +43 -0
  33. src/core/speaker_diarization.py +126 -0
  34. src/core/whisper_transcriber.py +113 -0
  35. src/deployment/__init__.py +8 -0
  36. src/deployment/deployment_manager.py +153 -0
  37. src/deployment/endpoint_manager.py +76 -0
  38. src/deployment/modal_deployer.py +97 -0
  39. src/interfaces/__init__.py +38 -0
  40. src/interfaces/__pycache__/__init__.cpython-310.pyc +0 -0
  41. src/interfaces/__pycache__/audio_processor.cpython-310.pyc +0 -0
  42. src/interfaces/__pycache__/audio_splitter.cpython-310.pyc +0 -0
  43. src/interfaces/__pycache__/podcast_downloader.cpython-310.pyc +0 -0
  44. src/interfaces/__pycache__/speaker_detector.cpython-310.pyc +0 -0
  45. src/interfaces/__pycache__/speaker_manager.cpython-310.pyc +0 -0
  46. src/interfaces/__pycache__/transcriber.cpython-310.pyc +0 -0
  47. src/interfaces/audio_processor.py +53 -0
  48. src/interfaces/audio_splitter.py +48 -0
  49. src/interfaces/podcast_downloader.py +66 -0
  50. src/interfaces/speaker_detector.py +71 -0
src/__init__.py ADDED
@@ -0,0 +1,13 @@
+ """
+ PodcastMcpGradio - Podcast Processing and Analysis Framework
+
+ A comprehensive framework for podcast downloading, transcription, and analysis
+ with MCP (Model Context Protocol) integration and Gradio UI.
+ """
+
+ __version__ = "2.0.0"
+ __author__ = "PodcastMcpGradio Team"
+ __description__ = "Podcast Processing and Analysis Framework"
+
+ # Core modules will be imported as needed
+ __all__ = []
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (507 Bytes). View file
 
src/__pycache__/app.cpython-310.pyc ADDED
Binary file (5.02 kB). View file
 
src/adapters/__init__.py ADDED
@@ -0,0 +1,13 @@
+ """
+ Adapters for different transcription backends
+ """
+
+ from .transcription_adapter_factory import TranscriptionAdapterFactory
+ from .local_adapter import LocalTranscriptionAdapter
+ from .modal_adapter import ModalTranscriptionAdapter
+
+ __all__ = [
+     "TranscriptionAdapterFactory",
+     "LocalTranscriptionAdapter",
+     "ModalTranscriptionAdapter"
+ ]
src/adapters/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (460 Bytes). View file
 
src/adapters/__pycache__/local_adapter.cpython-310.pyc ADDED
Binary file (3.21 kB). View file
 
src/adapters/__pycache__/modal_adapter.cpython-310.pyc ADDED
Binary file (3.77 kB). View file
 
src/adapters/__pycache__/transcription_adapter_factory.cpython-310.pyc ADDED
Binary file (2.44 kB). View file
 
src/adapters/local_adapter.py ADDED
@@ -0,0 +1,93 @@
+ """
+ Local transcription adapter for direct processing
+ """
+
+ import asyncio
+ from typing import List, Optional
+
+ from ..interfaces.transcriber import ITranscriber, TranscriptionResult
+ from ..utils.config import AudioProcessingConfig
+ from ..utils.errors import TranscriptionError
+
+
+ class LocalTranscriptionAdapter(ITranscriber):
+     """Adapter for local transcription processing"""
+
+     def __init__(self, config: Optional[AudioProcessingConfig] = None):
+         self.config = config or AudioProcessingConfig()
+
+     async def transcribe(
+         self,
+         audio_file_path: str,
+         model_size: str = "turbo",
+         language: Optional[str] = None,
+         enable_speaker_diarization: bool = False
+     ) -> TranscriptionResult:
+         """Transcribe audio using local processing"""
+
+         try:
+             # Use the new AudioProcessingService instead of old methods
+             from ..services.audio_processing_service import AudioProcessingService
+             from ..models.services import AudioProcessingRequest
+
+             print(f"🔄 Starting local transcription for: {audio_file_path}")
+             print(f"🚀 Running transcription with {model_size} model...")
+
+             # Create service and request
+             audio_service = AudioProcessingService()
+             request = AudioProcessingRequest(
+                 audio_file_path=audio_file_path,
+                 model_size=model_size,
+                 language=language,
+                 output_format="json",
+                 enable_speaker_diarization=enable_speaker_diarization
+             )
+
+             # Process transcription
+             result = audio_service.transcribe_full_audio(request)
+
+             # Convert service result to adapter format
+             return self._convert_service_result(result)
+
+         except Exception as e:
+             raise TranscriptionError(
+                 f"Local transcription failed: {str(e)}",
+                 model=model_size,
+                 audio_file=audio_file_path
+             )
+
+     def get_supported_models(self) -> List[str]:
+         """Get list of supported model sizes"""
+         return list(self.config.whisper_models.keys())
+
+     def get_supported_languages(self) -> List[str]:
+         """Get list of supported language codes"""
+         # This would normally come from Whisper's supported languages
+         return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]
+
+     def _convert_service_result(self, service_result) -> TranscriptionResult:
+         """Convert service result format to TranscriptionResult"""
+         from ..interfaces.transcriber import TranscriptionSegment
+
+         # Extract segments from service result if available
+         segments = []
+         if hasattr(service_result, 'segments') and service_result.segments:
+             for seg in service_result.segments:
+                 segments.append(TranscriptionSegment(
+                     start=getattr(seg, 'start', 0),
+                     end=getattr(seg, 'end', 0),
+                     text=getattr(seg, 'text', ''),
+                     speaker=getattr(seg, 'speaker', None)
+                 ))
+
+         return TranscriptionResult(
+             text=getattr(service_result, 'text', ''),
+             segments=segments,
+             language=getattr(service_result, 'language_detected', 'unknown'),
+             model_used=getattr(service_result, 'model_used', 'unknown'),
+             audio_duration=getattr(service_result, 'audio_duration', 0),
+             processing_time=getattr(service_result, 'processing_time', 0),
+             speaker_diarization_enabled=getattr(service_result, 'speaker_diarization_enabled', False),
+             global_speaker_count=getattr(service_result, 'global_speaker_count', 0),
+             error_message=getattr(service_result, 'error_message', None)
+         )
src/adapters/modal_adapter.py ADDED
@@ -0,0 +1,126 @@
+ """
+ Modal transcription adapter for remote processing
+ """
+
+ import requests
+ import base64
+ import pathlib
+ from typing import List, Optional
+
+ from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
+ from ..utils.config import AudioProcessingConfig
+ from ..utils.errors import TranscriptionError
+
+
+ class ModalTranscriptionAdapter(ITranscriber):
+     """Adapter for Modal remote transcription processing"""
+
+     def __init__(self, config: Optional[AudioProcessingConfig] = None, endpoint_url: Optional[str] = None):
+         self.config = config or AudioProcessingConfig()
+         self.endpoint_url = endpoint_url
+
+     async def transcribe(
+         self,
+         audio_file_path: str,
+         model_size: str = "turbo",
+         language: Optional[str] = None,
+         enable_speaker_diarization: bool = False
+     ) -> TranscriptionResult:
+         """Transcribe audio using Modal endpoint"""
+
+         if not self.endpoint_url:
+             raise TranscriptionError(
+                 "Modal endpoint URL not configured",
+                 model=model_size,
+                 audio_file=audio_file_path
+             )
+
+         try:
+             # Read and encode audio file
+             audio_path = pathlib.Path(audio_file_path)
+             if not audio_path.exists():
+                 raise TranscriptionError(
+                     f"Audio file not found: {audio_file_path}",
+                     audio_file=audio_file_path
+                 )
+
+             with open(audio_path, 'rb') as f:
+                 audio_data = f.read()
+
+             audio_base64 = base64.b64encode(audio_data).decode('utf-8')
+
+             # Prepare request data
+             request_data = {
+                 "audio_file_data": audio_base64,
+                 "audio_file_name": audio_path.name,
+                 "model_size": model_size,
+                 "language": language,
+                 "output_format": "json",
+                 "enable_speaker_diarization": enable_speaker_diarization
+             }
+
+             print(f"🔄 Sending transcription request to Modal endpoint")
+             print(f"📁 File: {audio_file_path} ({len(audio_data) / (1024*1024):.2f} MB)")
+             print(f"🔧 Model: {model_size}, Speaker diarization: {enable_speaker_diarization}")
+
+             # Make request to Modal endpoint
+             response = requests.post(
+                 self.endpoint_url,
+                 json=request_data,
+                 timeout=1800  # 30 minutes timeout
+             )
+
+             response.raise_for_status()
+             result = response.json()
+
+             print(f"✅ Modal transcription completed")
+
+             # Convert result to TranscriptionResult format
+             return self._convert_modal_result(result)
+
+         except requests.exceptions.RequestException as e:
+             raise TranscriptionError(
+                 f"Failed to call Modal endpoint: {str(e)}",
+                 model=model_size,
+                 audio_file=audio_file_path
+             )
+         except Exception as e:
+             raise TranscriptionError(
+                 f"Modal transcription failed: {str(e)}",
+                 model=model_size,
+                 audio_file=audio_file_path
+             )
+
+     def get_supported_models(self) -> List[str]:
+         """Get list of supported model sizes"""
+         return list(self.config.whisper_models.keys())
+
+     def get_supported_languages(self) -> List[str]:
+         """Get list of supported language codes"""
+         return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]
+
+     def _convert_modal_result(self, modal_result: dict) -> TranscriptionResult:
+         """Convert Modal result format to TranscriptionResult"""
+
+         # Extract segments if available
+         segments = []
+         if "segments" in modal_result:
+             for seg in modal_result["segments"]:
+                 segments.append(TranscriptionSegment(
+                     start=seg.get("start", 0),
+                     end=seg.get("end", 0),
+                     text=seg.get("text", ""),
+                     speaker=seg.get("speaker")
+                 ))
+
+         return TranscriptionResult(
+             text=modal_result.get("text", ""),
+             segments=segments,
+             language=modal_result.get("language_detected", "unknown"),
+             model_used=modal_result.get("model_used", "unknown"),
+             audio_duration=modal_result.get("audio_duration", 0),
+             processing_time=modal_result.get("processing_time", 0),
+             speaker_diarization_enabled=modal_result.get("speaker_diarization_enabled", False),
+             global_speaker_count=modal_result.get("global_speaker_count", 0),
+             error_message=modal_result.get("error_message")
+         )
src/adapters/transcription_adapter_factory.py ADDED
@@ -0,0 +1,77 @@
+ """
+ Factory for creating transcription adapters
+ """
+
+ import os
+ from typing import Optional
+
+ from ..interfaces.transcriber import ITranscriber
+ from ..utils.config import AudioProcessingConfig
+ from ..utils.errors import ConfigurationError
+ from .local_adapter import LocalTranscriptionAdapter
+ from .modal_adapter import ModalTranscriptionAdapter
+
+
+ class TranscriptionAdapterFactory:
+     """Factory for creating appropriate transcription adapters"""
+
+     @staticmethod
+     def create_adapter(
+         deployment_mode: str = "auto",
+         config: Optional[AudioProcessingConfig] = None,
+         endpoint_url: Optional[str] = None
+     ) -> ITranscriber:
+         """
+         Create transcription adapter based on deployment mode
+
+         Args:
+             deployment_mode: "local", "modal", or "auto"
+             config: Configuration object
+             endpoint_url: Modal endpoint URL (for modal/auto mode)
+
+         Returns:
+             ITranscriber: Appropriate transcription adapter
+         """
+
+         config = config or AudioProcessingConfig()
+
+         # Auto mode: decide based on environment and endpoint availability
+         if deployment_mode == "auto":
+             if endpoint_url:
+                 print(f"🌐 Auto mode: Using Modal adapter with endpoint {endpoint_url}")
+                 return ModalTranscriptionAdapter(config=config, endpoint_url=endpoint_url)
+             else:
+                 print(f"🏠 Auto mode: Using Local adapter (no endpoint configured)")
+                 return LocalTranscriptionAdapter(config=config)
+
+         # Explicit local mode
+         elif deployment_mode == "local":
+             print(f"🏠 Using Local transcription adapter")
+             return LocalTranscriptionAdapter(config=config)
+
+         # Explicit modal mode
+         elif deployment_mode == "modal":
+             if not endpoint_url:
+                 raise ConfigurationError(
+                     "Modal endpoint URL is required for modal mode",
+                     config_key="endpoint_url"
+                 )
+             print(f"🌐 Using Modal transcription adapter with endpoint {endpoint_url}")
+             return ModalTranscriptionAdapter(config=config, endpoint_url=endpoint_url)
+
+         else:
+             raise ConfigurationError(
+                 f"Unsupported deployment mode: {deployment_mode}. Use 'local', 'modal', or 'auto'",
+                 config_key="deployment_mode"
+             )
+
+     @staticmethod
+     def _detect_deployment_mode() -> str:
+         """Auto-detect deployment mode based on environment"""
+         import os
+
+         # Check if running in Modal environment
+         if os.environ.get("MODAL_TASK_ID"):
+             return "local"  # We're inside Modal, use local processing
+         else:
+             return "modal"  # We're outside Modal, use remote endpoint
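A minimal usage sketch for the factory above (assumes the package is importable as `src`; the endpoint URL shown is a placeholder, the real one comes from the Modal deployment):

import asyncio
from src.adapters import TranscriptionAdapterFactory

# Placeholder endpoint URL; a real value is produced when the Modal service is deployed
transcriber = TranscriptionAdapterFactory.create_adapter(
    deployment_mode="auto",
    endpoint_url="https://your-workspace--transcribe-audio-chunk-endpoint.modal.run",
)

# ITranscriber.transcribe is async, so drive it with asyncio (audio path is illustrative)
result = asyncio.run(transcriber.transcribe("episode.mp3", model_size="turbo"))
print(result.text)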
src/api/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """
+ API Module - External interfaces and endpoints
+ """
+
+ __all__ = []
src/api/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (223 Bytes). View file
 
src/api/__pycache__/transcription_api.cpython-310.pyc ADDED
Binary file (3.27 kB). View file
 
src/api/transcription_api.py ADDED
@@ -0,0 +1,112 @@
+ """
+ Transcription API module
+ """
+
+ import os
+ from typing import Optional, Dict, Any
+
+ from ..adapters import TranscriptionAdapterFactory
+ from ..services import TranscriptionService
+ from ..core import FFmpegAudioSplitter
+ from ..utils import AudioProcessingConfig, AudioProcessingError
+
+
+ class TranscriptionAPI:
+     """High-level API for transcription operations"""
+
+     def __init__(self, config: Optional[AudioProcessingConfig] = None):
+         self.config = config or AudioProcessingConfig()
+         self.transcription_service = None
+         self._initialize_service()
+
+     def _initialize_service(self):
+         """Initialize transcription service with appropriate adapter"""
+         try:
+             # Get endpoint URL from config file if available
+             endpoint_url = self._get_endpoint_url()
+
+             # Create appropriate adapter
+             transcriber = TranscriptionAdapterFactory.create_adapter(
+                 deployment_mode="auto",
+                 config=self.config,
+                 endpoint_url=endpoint_url
+             )
+
+             # Create audio splitter
+             audio_splitter = FFmpegAudioSplitter()
+
+             # Create transcription service
+             self.transcription_service = TranscriptionService(
+                 transcriber=transcriber,
+                 audio_splitter=audio_splitter,
+                 speaker_detector=None,  # TODO: Add speaker detector when implemented
+                 config=self.config
+             )
+
+         except Exception as e:
+             print(f"⚠️ Failed to initialize transcription service: {e}")
+             raise AudioProcessingError(f"Service initialization failed: {e}")
+
+     def _get_endpoint_url(self) -> Optional[str]:
+         """Get Modal endpoint URL from configuration"""
+         try:
+             import json
+             config_file = "endpoint_config.json"
+             if os.path.exists(config_file):
+                 with open(config_file, 'r') as f:
+                     config = json.load(f)
+                 return config.get("transcribe_audio")
+         except Exception:
+             pass
+         return None
+
+     async def transcribe_audio_file(
+         self,
+         audio_file_path: str,
+         model_size: str = "turbo",
+         language: Optional[str] = None,
+         output_format: str = "srt",
+         enable_speaker_diarization: bool = False
+     ) -> Dict[str, Any]:
+         """Transcribe audio file using the configured service"""
+
+         if not self.transcription_service:
+             raise AudioProcessingError("Transcription service not initialized")
+
+         return await self.transcription_service.transcribe_audio_file(
+             audio_file_path=audio_file_path,
+             model_size=model_size,
+             language=language,
+             output_format=output_format,
+             enable_speaker_diarization=enable_speaker_diarization
+         )
+
+
+ # Create global API instance
+ _api_instance = None
+
+ def get_transcription_api() -> TranscriptionAPI:
+     """Get global transcription API instance"""
+     global _api_instance
+     if _api_instance is None:
+         _api_instance = TranscriptionAPI()
+     return _api_instance
+
+ async def transcribe_audio_adaptive_sync(
+     audio_file_path: str,
+     model_size: str = "turbo",
+     language: Optional[str] = None,
+     output_format: str = "srt",
+     enable_speaker_diarization: bool = False
+ ) -> Dict[str, Any]:
+     """
+     Adaptive transcription function that routes to appropriate backend
+     """
+     api = get_transcription_api()
+     return await api.transcribe_audio_file(
+         audio_file_path=audio_file_path,
+         model_size=model_size,
+         language=language,
+         output_format=output_format,
+         enable_speaker_diarization=enable_speaker_diarization
+     )
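A sketch of calling the high-level API above from async code (the audio path is illustrative; routing follows whatever `endpoint_config.json` is present):

import asyncio
from src.api.transcription_api import transcribe_audio_adaptive_sync

async def demo():
    # Uses the Modal endpoint when one is configured, otherwise falls back to local processing
    result = await transcribe_audio_adaptive_sync(
        audio_file_path="downloads/episode.mp3",  # illustrative path
        model_size="turbo",
        output_format="srt",
        enable_speaker_diarization=False,
    )
    print(result)

asyncio.run(demo())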
src/app.py ADDED
@@ -0,0 +1,169 @@
+ # FastAPI + Gradio + FastMCP MCP server main entry point
+
+ import modal
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI
+ from gradio.routes import mount_gradio_app
+ import os
+ from dotenv import load_dotenv
+ import uvicorn
+ from mcp.server.fastmcp import FastMCP
+
+ # Import modules
+ from .tools import mcp_tools  # Import the module, not get_mcp_server function
+ from .ui.gradio_ui import create_gradio_interface
+ from .config.config import is_modal_mode, is_local_mode
+
+ # Always import modal config since this module might be imported in modal context
+ try:
+     from .config.modal_config import app, image, volume, cache_dir, secrets
+     _modal_available = True
+ except ImportError:
+     _modal_available = False
+
+ # ==================== Application Creation Function ====================
+
+ def create_app():
+     """Create and return complete Gradio + MCP application"""
+
+     print("🚀 Starting Gradio + FastMCP server")
+
+     # Create FastMCP server with new tools
+     mcp = FastMCP("Podcast MCP")
+
+     # Register tools using the new service architecture
+     @mcp.tool(description="Transcribe audio files to text using Whisper model with speaker diarization support")
+     async def transcribe_audio_file_tool(
+         audio_file_path: str,
+         model_size: str = "turbo",
+         language: str = None,
+         output_format: str = "srt",
+         enable_speaker_diarization: bool = False
+     ):
+         return await mcp_tools.transcribe_audio_file(
+             audio_file_path, model_size, language, output_format, enable_speaker_diarization
+         )
+
+     @mcp.tool(description="Download Apple Podcast audio files")
+     async def download_apple_podcast_tool(url: str):
+         return await mcp_tools.download_apple_podcast(url)
+
+     @mcp.tool(description="Download XiaoYuZhou podcast audio files")
+     async def download_xyz_podcast_tool(url: str):
+         return await mcp_tools.download_xyz_podcast(url)
+
+     @mcp.tool(description="Scan directory for MP3 audio files")
+     async def get_mp3_files_tool(directory: str):
+         return await mcp_tools.get_mp3_files(directory)
+
+     @mcp.tool(description="Get basic file information")
+     async def get_file_info_tool(file_path: str):
+         return await mcp_tools.get_file_info(file_path)
+
+     @mcp.tool(description="Read text file content in segments")
+     async def read_text_file_segments_tool(
+         file_path: str,
+         chunk_size: int = 65536,
+         start_position: int = 0
+     ):
+         return await mcp_tools.read_text_file_segments(file_path, chunk_size, start_position)
+
+     # Create FastAPI wrapper
+     fastapi_wrapper = FastAPI(
+         title="Modal AudioTranscriber MCP",
+         description="Gradio UI + FastMCP Tool + Modal Integration AudioTranscriber MCP",
+         version="1.0.0",
+         lifespan=lambda app: mcp.session_manager.run()
+     )
+
+     # Get FastMCP's streamable HTTP app
+     mcp_app = mcp.streamable_http_app()
+
+     # Mount FastMCP application to /api path
+     fastapi_wrapper.mount("/api", mcp_app)
+
+     # Create Gradio interface
+     ui_app = create_gradio_interface()
+
+     # Use Gradio's standard mounting approach
+     final_app = mount_gradio_app(
+         app=fastapi_wrapper,
+         blocks=ui_app,
+         path="/",
+         app_kwargs={
+             "docs_url": "/docs",
+             "redoc_url": "/redoc",
+         }
+     )
+
+     print("✅ Server startup completed")
+     print("🎨 Gradio UI: /")
+     print("🔧 MCP Streamable HTTP: /api/mcp")
+     print(f"📝 Server name: {mcp.name}")
+
+     return final_app
+
+ # ==================== Modal Deployment Configuration ====================
+
+ # Create a separate Modal app for the Gradio interface
+ if _modal_available:
+     gradio_mcp_app = modal.App(name="gradio-mcp-ui")
+
+     @gradio_mcp_app.function(
+         image=image,
+         cpu=2,  # Adequate CPU for UI operations
+         memory=4096,  # 4GB memory for stable UI performance
+         max_containers=5,  # Reduced to control resource usage
+         min_containers=1,  # Keep minimum containers for faster response
+         scaledown_window=600,  # 10 minutes before scaling down
+         timeout=1800,  # 30 minutes timeout to prevent preemption
+         volumes={cache_dir: volume},
+         secrets=secrets,
+     )
+     @modal.concurrent(max_inputs=100)
+     @modal.asgi_app()
+     def app_entry():
+         """Modal deployment function - create and return complete Gradio + MCP application"""
+         return create_app()
+
+ # ==================== Main Entry Point ====================
+
+ def main():
+     """Main entry point for all deployment modes"""
+
+     if is_modal_mode():
+         print("☁️ Modal mode: Use 'modal deploy src.app::gradio_mcp_app'")
+         return None
+     else:
+         print("🏠 Starting in local mode")
+         print("💡 GPU functions will be routed to Modal endpoints")
+
+         app = create_app()
+         return app
+
+ def run_local():
+     """Run local server with uvicorn (for direct execution)"""
+     app = main()
+     if app:
+         uvicorn.run(
+             app,
+             host="0.0.0.0",
+             port=8000,
+             reload=False
+         )
+
+ # ==================== Hugging Face Spaces Support ====================
+
+ # For Hugging Face Spaces, directly create the app
+ def get_app():
+     """Get app instance for HF Spaces"""
+     if "DEPLOYMENT_MODE" not in os.environ:
+         os.environ["DEPLOYMENT_MODE"] = "local"
+     return main()
+
+ # Create app for HF Spaces when imported
+ if __name__ != "__main__":
+     app = get_app()
+
+ if __name__ == "__main__":
+     run_local()
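Since the module ends with `run_local()` under `__main__`, one way to start the server locally is a small launcher like the sketch below (assumes dependencies are installed and the repository root is on `PYTHONPATH`); running `python -m src.app` from the repository root should behave the same way.

# run_server.py - hypothetical launcher placed next to the src/ package
import uvicorn
from src.app import create_app

app = create_app()  # Gradio UI mounted at /, MCP streamable HTTP at /api/mcp
uvicorn.run(app, host="0.0.0.0", port=8000)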
src/config/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """
+ Config Module - Configuration management
+ """
+
+ __all__ = []
src/config/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (220 Bytes). View file
 
src/config/__pycache__/config.cpython-310.pyc ADDED
Binary file (2.78 kB). View file
 
src/config/__pycache__/modal_config.cpython-310.pyc ADDED
Binary file (4.98 kB). View file
 
src/config/config.py ADDED
@@ -0,0 +1,81 @@
+ """
+ Deployment configuration for Gradio + MCP Server
+ Supports two deployment modes:
+ 1. Local mode: Gradio runs locally, GPU functions call Modal endpoints
+ 2. Modal mode: Gradio runs on Modal, GPU functions run locally on Modal
+ """
+
+ import os
+ from enum import Enum
+ from typing import Optional
+
+ class DeploymentMode(Enum):
+     LOCAL = "local"   # Local Gradio + Remote GPU (Modal endpoints)
+     MODAL = "modal"   # Modal Gradio + Local GPU (Modal functions)
+
+ # Get deployment mode from environment variable
+ DEPLOYMENT_MODE = DeploymentMode(os.getenv("DEPLOYMENT_MODE", "local"))
+
+ # Modal endpoints configuration
+ MODAL_APP_NAME = "gradio-mcp-server"
+
+ # Endpoint URLs (will be set when deployed)
+ ENDPOINTS = {
+     "transcribe_audio": None,  # Will be filled with actual endpoint URL
+ }
+
+ def get_deployment_mode() -> DeploymentMode:
+     """Get current deployment mode"""
+     return DEPLOYMENT_MODE
+
+ def is_local_mode() -> bool:
+     """Check if running in local mode"""
+     return DEPLOYMENT_MODE == DeploymentMode.LOCAL
+
+ def is_modal_mode() -> bool:
+     """Check if running in modal mode"""
+     return DEPLOYMENT_MODE == DeploymentMode.MODAL
+
+ def set_endpoint_url(endpoint_name: str, url: str):
+     """Set endpoint URL for local mode"""
+     global ENDPOINTS
+     ENDPOINTS[endpoint_name] = url
+
+ def get_endpoint_url(endpoint_name: str) -> Optional[str]:
+     """Get endpoint URL for local mode"""
+     return ENDPOINTS.get(endpoint_name)
+
+ def get_transcribe_endpoint_url() -> Optional[str]:
+     """Get transcription endpoint URL"""
+     return get_endpoint_url("transcribe_audio")
+
+ # Environment-specific cache directory
+ def get_cache_dir() -> str:
+     """Get cache directory based on deployment mode"""
+     if is_modal_mode():
+         return "/root/cache"
+     else:
+         # Local mode - use user's home directory
+         home_dir = os.path.expanduser("~")
+         cache_dir = os.path.join(home_dir, ".gradio_mcp_cache")
+         os.makedirs(cache_dir, exist_ok=True)
+         return cache_dir
+
+ # Auto-load endpoint configuration in local mode
+ if is_local_mode():
+     import json
+     config_file = "endpoint_config.json"
+     if os.path.exists(config_file):
+         try:
+             with open(config_file, 'r') as f:
+                 config = json.load(f)
+             for endpoint_name, url in config.items():
+                 set_endpoint_url(endpoint_name, url)
+             print(f"✅ Loaded endpoint configuration from {config_file}")
+         except Exception as e:
+             print(f"⚠️ Failed to load endpoint configuration: {e}")
+     else:
+         print(f"⚠️ No endpoint configuration found. Run 'python deploy_endpoints.py deploy' first.")
+
+ print(f"🚀 Deployment mode: {DEPLOYMENT_MODE.value}")
+ print(f"📁 Cache directory: {get_cache_dir()}")
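The module above expects an `endpoint_config.json` in the working directory. A sketch of writing one by hand (the URL is a placeholder; the real value is printed by the Modal deployment):

import json

# Placeholder URL; replace with the endpoint URL reported by `modal deploy`
endpoints = {"transcribe_audio": "https://your-workspace--transcribe-audio-chunk-endpoint.modal.run"}

with open("endpoint_config.json", "w") as f:
    json.dump(endpoints, f, indent=2)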
src/config/modal_config.py ADDED
@@ -0,0 +1,210 @@
+ import modal
+ import os
+
+ # Create Modal application
+ app = modal.App(name="gradio-mcp-server")
+
+ # Try to get Hugging Face token from Modal secrets (required for speaker diarization)
+ try:
+     hf_secret = modal.Secret.from_name("huggingface-secret")
+     print("✅ Found Hugging Face secret configuration")
+ except Exception:
+     hf_secret = None
+     print("⚠️ Hugging Face secret not found, speaker diarization will be disabled")
+
+ # Create mounted volume
+ volume = modal.Volume.from_name("cache-volume", create_if_missing=True)
+ cache_dir = "/root/cache"
+
+ # Model preloading function
+ def download_models() -> None:
+     """Download and cache Whisper and speaker diarization models"""
+     import whisper
+     import os
+     from pathlib import Path
+
+     # Create model cache directory
+     model_cache_dir = Path("/model")
+     model_cache_dir.mkdir(exist_ok=True)
+
+     print("📥 Downloading Whisper turbo model...")
+     # Download and cache Whisper turbo model
+     whisper_model = whisper.load_model("turbo", download_root="/model")
+     print("✅ Whisper turbo model downloaded and cached")
+
+     # Download speaker diarization models if HF token is available
+     if os.environ.get("HF_TOKEN"):
+         try:
+             print("📥 Downloading speaker diarization models...")
+             from pyannote.audio import Pipeline, Model
+             from pyannote.audio.core.inference import Inference
+             import torch
+
+             # Set proper cache directory for pyannote
+             os.environ["PYANNOTE_CACHE"] = "/model/speaker-diarization"
+
+             # Download and cache speaker diarization pipeline
+             # This will automatically cache to the PYANNOTE_CACHE directory
+             pipeline = Pipeline.from_pretrained(
+                 "pyannote/speaker-diarization-3.1",
+                 use_auth_token=os.environ["HF_TOKEN"],
+                 cache_dir="/model/speaker-diarization"
+             )
+
+             # Preload speaker embedding model for speaker identification
+             print("📥 Downloading speaker embedding model...")
+             embedding_model = Model.from_pretrained(
+                 "pyannote/embedding",
+                 use_auth_token=os.environ["HF_TOKEN"],
+                 cache_dir="/model/speaker-embedding"
+             )
+
+             # Set device for models
+             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+             embedding_model.to(device)
+             embedding_model.eval()
+
+             # Create inference object for embedding extraction
+             inference = Inference(embedding_model, window="whole")
+
+             # Verify the pipeline works
+             print("🧪 Testing speaker diarization pipeline...")
+
+             # Create a simple marker file to indicate successful download
+             import json
+             speaker_dir = Path("/model/speaker-diarization")
+             speaker_dir.mkdir(exist_ok=True, parents=True)
+
+             embedding_dir = Path("/model/speaker-embedding")
+             embedding_dir.mkdir(exist_ok=True, parents=True)
+
+             config = {
+                 "model_name": "pyannote/speaker-diarization-3.1",
+                 "embedding_model_name": "pyannote/embedding",
+                 "cached_at": str(speaker_dir),
+                 "embedding_cached_at": str(embedding_dir),
+                 "cache_complete": True,
+                 "embedding_cache_complete": True,
+                 "pyannote_cache_env": "/model/speaker-diarization",
+                 "device": str(device)
+             }
+             with open(speaker_dir / "download_complete.json", "w") as f:
+                 json.dump(config, f)
+
+             print("✅ Speaker diarization and embedding models downloaded and cached")
+         except Exception as e:
+             print(f"⚠️ Failed to download speaker diarization models: {e}")
+     else:
+         print("⚠️ No HF_TOKEN found, skipping speaker diarization model download")
+
+ # Create image environment with model preloading
+ image = modal.Image.debian_slim(python_version="3.11").apt_install(
+     # Basic tools
+     "ffmpeg",
+     "wget",
+     "curl",
+     "unzip",
+     "gnupg2",
+     "git",  # Required by Whisper
+     # Chrome dependencies
+     "libglib2.0-0",
+     "libnss3",
+     "libatk-bridge2.0-0",
+     "libdrm2",
+     "libxkbcommon0",
+     "libxcomposite1",
+     "libxdamage1",
+     "libxrandr2",
+     "libgbm1",
+     "libxss1",
+     "libasound2"
+ ).run_commands(
+     # Download and install Chrome directly (faster method)
+     "wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
+     "apt-get install -y ./google-chrome-stable_current_amd64.deb || apt-get install -y -f",
+     "rm google-chrome-stable_current_amd64.deb"
+ ).pip_install(
+     # Web frameworks and basic libraries
+     "gradio>=5.31.0",
+     "fastapi",
+     "pydantic",
+     "python-dotenv",
+     # MCP related
+     "mcp[cli]",
+     "fastmcp>=2.7.0",
+     "starlette",
+     # Network and parsing
+     "beautifulsoup4",
+     "selenium",
+     "requests",
+     # Whisper and audio processing related
+     "git+https://github.com/openai/whisper.git",
+     "ffmpeg-python",
+     "torchaudio==2.1.0",
+     "numpy<2",
+     # Audio processing dependencies
+     "librosa",
+     "soundfile",
+     # Other Whisper ecosystem dependencies
+     "dacite",
+     "jiwer",
+     "pandas",
+     "loguru==0.6.0",
+     # GraphQL client (if needed)
+     "gql[all]~=3.0.0a5",
+     # Speaker diarization related dependencies
+     "pyannote.audio==3.1.0",
+     # System monitoring
+     "psutil",
+ ).run_function(
+     download_models,
+     secrets=[hf_secret] if hf_secret else []
+ )
+
+ # Update file paths to reflect new structure
+ image = image.add_local_dir("../src", remote_path="/root/src")
+ secrets = [hf_secret] if hf_secret else []
+
+ # ==================== Modal Endpoints Configuration ====================
+
+ @app.function(
+     image=image,
+     volumes={cache_dir: volume},
+     cpu=4,  # Increased CPU for better performance
+     memory=8192,  # 8GB memory for stable transcription
+     gpu="A10G",
+     timeout=1800,  # 30 minutes timeout for speaker diarization support
+     scaledown_window=40,  # 40 seconds before scaling down
+     secrets=secrets,
+ )
+ @modal.fastapi_endpoint(method="POST", label="transcribe-audio-chunk-endpoint")
+ def transcribe_audio_chunk_endpoint(request_data: dict):
+     """FastAPI endpoint for transcribing a single audio chunk (for distributed processing)"""
+     import sys
+     sys.path.append('/root')
+
+     from src.services.modal_transcription_service import ModalTranscriptionService
+
+     modal_service = ModalTranscriptionService(cache_dir="/root/cache", use_direct_modal_calls=True)
+     return modal_service.process_chunk_request(request_data)
+
+ @app.function(
+     image=image,
+     cpu=2,  # Increased CPU for better health check performance
+     memory=2048,  # 2GB memory for stability
+     timeout=300,  # 5 minutes timeout for health checks
+     scaledown_window=600,  # 10 minutes before scaling down
+     secrets=secrets,
+ )
+ @modal.fastapi_endpoint(method="GET", label="health-check-endpoint")
+ def health_check_endpoint():
+     """Health check endpoint to verify service status"""
+     import sys
+     sys.path.append('/root')
+
+     from src.services.health_service import HealthService
+
+     health_service = HealthService()
+     return health_service.get_health_status()
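A quick sketch of probing the deployed health-check endpoint above (the hostname is a placeholder; Modal prints the actual URL when the app is deployed):

import requests

# Placeholder URL; substitute the URL reported for the "health-check-endpoint" label
url = "https://your-workspace--health-check-endpoint.modal.run"
response = requests.get(url, timeout=30)
response.raise_for_status()
print(response.json())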
src/core/__init__.py ADDED
@@ -0,0 +1,29 @@
+ """
+ Core components for application and audio processing
+ """
+
+ # Original core components
+ from .config import AppConfig, app_config, get_deployment_mode, is_local_mode, is_modal_mode
+ from .exceptions import AppError, ConfigError, ValidationError
+
+ # Audio processing core components
+ from .audio_splitter import FFmpegAudioSplitter
+ from .whisper_transcriber import WhisperTranscriber
+ from .speaker_diarization import PyannoteSpeikerDetector
+
+ __all__ = [
+     # Original core
+     "AppConfig",
+     "app_config",
+     "get_deployment_mode",
+     "is_local_mode",
+     "is_modal_mode",
+     "AppError",
+     "ConfigError",
+     "ValidationError",
+
+     # Audio processing core
+     "FFmpegAudioSplitter",
+     "WhisperTranscriber",
+     "PyannoteSpeikerDetector"
+ ]
src/core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (713 Bytes). View file
 
src/core/__pycache__/audio_splitter.cpython-310.pyc ADDED
Binary file (2.32 kB). View file
 
src/core/__pycache__/config.cpython-310.pyc ADDED
Binary file (5.14 kB). View file
 
src/core/__pycache__/exceptions.cpython-310.pyc ADDED
Binary file (1.39 kB). View file
 
src/core/__pycache__/speaker_diarization.cpython-310.pyc ADDED
Binary file (3.78 kB). View file
 
src/core/__pycache__/whisper_transcriber.cpython-310.pyc ADDED
Binary file (3.41 kB). View file
 
src/core/audio_splitter.py ADDED
@@ -0,0 +1,90 @@
+ """
+ Audio splitter implementation using FFmpeg
+ """
+
+ import re
+ from typing import Iterator
+ import ffmpeg
+
+ from ..interfaces.audio_splitter import IAudioSplitter, AudioSegment
+ from ..utils.errors import AudioSplittingError
+
+
+ class FFmpegAudioSplitter(IAudioSplitter):
+     """Audio splitter using FFmpeg's silence detection"""
+
+     def split_audio(
+         self,
+         audio_path: str,
+         min_segment_length: float = 30.0,
+         min_silence_length: float = 1.0
+     ) -> Iterator[AudioSegment]:
+         """Split audio by silence detection"""
+
+         try:
+             silence_end_re = re.compile(
+                 r" silence_end: (?P<end>[0-9]+(\.?[0-9]*)) \| silence_duration: (?P<dur>[0-9]+(\.?[0-9]*))"
+             )
+
+             # Get audio duration
+             duration = self.get_audio_duration(audio_path)
+
+             # Use silence detection filter
+             reader = (
+                 ffmpeg.input(str(audio_path))
+                 .filter("silencedetect", n="-10dB", d=min_silence_length)
+                 .output("pipe:", format="null")
+                 .run_async(pipe_stderr=True)
+             )
+
+             cur_start = 0.0
+             segment_count = 0
+
+             while True:
+                 line = reader.stderr.readline().decode("utf-8")
+                 if not line:
+                     break
+
+                 match = silence_end_re.search(line)
+                 if match:
+                     silence_end, silence_dur = match.group("end"), match.group("dur")
+                     split_at = float(silence_end) - (float(silence_dur) / 2)
+
+                     if (split_at - cur_start) < min_segment_length:
+                         continue
+
+                     yield AudioSegment(
+                         start=cur_start,
+                         end=split_at,
+                         duration=split_at - cur_start
+                     )
+                     cur_start = split_at
+                     segment_count += 1
+
+             # Handle the last segment
+             if duration > cur_start:
+                 yield AudioSegment(
+                     start=cur_start,
+                     end=duration,
+                     duration=duration - cur_start
+                 )
+                 segment_count += 1
+
+             print(f"Audio split into {segment_count} segments")
+
+         except Exception as e:
+             raise AudioSplittingError(
+                 f"Failed to split audio: {str(e)}",
+                 audio_file=audio_path
+             )
+
+     def get_audio_duration(self, audio_path: str) -> float:
+         """Get total duration of audio file"""
+         try:
+             metadata = ffmpeg.probe(audio_path)
+             return float(metadata["format"]["duration"])
+         except Exception as e:
+             raise AudioSplittingError(
+                 f"Failed to get audio duration: {str(e)}",
+                 audio_file=audio_path
+             )
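A sketch of consuming the generator above (assumes `ffmpeg` is installed and on PATH; the audio path is illustrative):

from src.core.audio_splitter import FFmpegAudioSplitter

splitter = FFmpegAudioSplitter()
for segment in splitter.split_audio("episode.mp3", min_segment_length=30.0):
    # Each AudioSegment carries start/end/duration in seconds
    print(f"{segment.start:.1f}s -> {segment.end:.1f}s ({segment.duration:.1f}s)")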
src/core/config.py ADDED
@@ -0,0 +1,150 @@
+ """
+ Configuration management for PodcastMCP
+ """
+
+ import os
+ import json
+ from enum import Enum
+ from typing import Optional, Dict, Any
+ from pathlib import Path
+
+
+ class DeploymentMode(Enum):
+     """Deployment mode enumeration"""
+     LOCAL = "local"      # Local Gradio + Modal GPU endpoints
+     MODAL = "modal"      # Runs entirely on the Modal platform
+     HF_SPACES = "hf"     # Hugging Face Spaces deployment
+
+
+ class AppConfig:
+     """Application configuration manager"""
+
+     def __init__(self):
+         self._deployment_mode = self._detect_deployment_mode()
+         self._cache_dir = self._get_cache_directory()
+         self._endpoints = self._load_endpoints()
+
+     @property
+     def deployment_mode(self) -> DeploymentMode:
+         """Get the current deployment mode"""
+         return self._deployment_mode
+
+     @property
+     def cache_dir(self) -> str:
+         """Get the cache directory"""
+         return self._cache_dir
+
+     @property
+     def is_local_mode(self) -> bool:
+         """Whether running in local mode"""
+         return self._deployment_mode == DeploymentMode.LOCAL
+
+     @property
+     def is_modal_mode(self) -> bool:
+         """Whether running in Modal mode"""
+         return self._deployment_mode == DeploymentMode.MODAL
+
+     @property
+     def is_hf_spaces_mode(self) -> bool:
+         """Whether running in HF Spaces mode"""
+         return self._deployment_mode == DeploymentMode.HF_SPACES
+
+     def get_transcribe_endpoint_url(self) -> Optional[str]:
+         """Get the transcription endpoint URL"""
+         return self._endpoints.get("transcribe_audio")
+
+     def set_endpoint_url(self, service: str, url: str):
+         """Set an endpoint URL"""
+         self._endpoints[service] = url
+         self._save_endpoints()
+
+     def _detect_deployment_mode(self) -> DeploymentMode:
+         """Auto-detect the deployment mode"""
+         # Check environment variable
+         mode = os.environ.get("DEPLOYMENT_MODE", "").lower()
+         if mode == "modal":
+             return DeploymentMode.MODAL
+         elif mode == "hf":
+             return DeploymentMode.HF_SPACES
+
+         # Check for HF Spaces environment
+         if os.environ.get("SPACE_ID") or os.environ.get("SPACES_ZERO_GPU"):
+             return DeploymentMode.HF_SPACES
+
+         # Check for Modal environment
+         if os.environ.get("MODAL_TASK_ID") or os.environ.get("MODAL_IS_INSIDE_CONTAINER"):
+             return DeploymentMode.MODAL
+
+         # Default to local mode
+         return DeploymentMode.LOCAL
+
+     def _get_cache_directory(self) -> str:
+         """Get the cache directory path"""
+         if self.is_modal_mode:
+             return "/root/cache"
+         else:
+             # Local mode and HF Spaces use the user's cache directory
+             home_dir = Path.home()
+             cache_dir = home_dir / ".gradio_mcp_cache"
+             cache_dir.mkdir(exist_ok=True)
+             return str(cache_dir)
+
+     def _load_endpoints(self) -> Dict[str, str]:
+         """Load the endpoint configuration"""
+         config_file = Path("endpoint_config.json")
+         if config_file.exists():
+             try:
+                 with open(config_file, 'r') as f:
+                     endpoints = json.load(f)
+                 print(f"✅ Loaded endpoint configuration from {config_file}")
+                 return endpoints
+             except Exception as e:
+                 print(f"⚠️ Failed to load endpoint config: {e}")
+         else:
+             print("⚠️ No endpoint configuration found. Run deployment first.")
+
+         return {}
+
+     def _save_endpoints(self):
+         """Save the endpoint configuration"""
+         config_file = Path("endpoint_config.json")
+         try:
+             with open(config_file, 'w') as f:
+                 json.dump(self._endpoints, f, indent=2)
+             print(f"💾 Endpoint configuration saved to {config_file}")
+         except Exception as e:
+             print(f"⚠️ Failed to save endpoint config: {e}")
+
+
+ # Global configuration instance
+ app_config = AppConfig()
+
+ # Backward-compatible function interface
+ def get_deployment_mode() -> str:
+     """Get the deployment mode as a string"""
+     return app_config.deployment_mode.value
+
+ def is_local_mode() -> bool:
+     """Whether running in local mode"""
+     return app_config.is_local_mode
+
+ def is_modal_mode() -> bool:
+     """Whether running in Modal mode"""
+     return app_config.is_modal_mode
+
+ def get_cache_dir() -> str:
+     """Get the cache directory"""
+     return app_config.cache_dir
+
+ def get_transcribe_endpoint_url() -> Optional[str]:
+     """Get the transcription endpoint URL"""
+     return app_config.get_transcribe_endpoint_url()
+
+ def set_endpoint_url(service: str, url: str):
+     """Set an endpoint URL"""
+     app_config.set_endpoint_url(service, url)
+
+
+ # Print configuration info
+ print(f"🚀 Deployment mode: {app_config.deployment_mode.value}")
+ print(f"📁 Cache directory: {app_config.cache_dir}")
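Because `AppConfig` reads `DEPLOYMENT_MODE` at import time, the mode can be forced by setting the environment variable before the module is imported; a minimal sketch:

import os

os.environ["DEPLOYMENT_MODE"] = "modal"  # or "hf"; anything else falls back to auto-detection

from src.core.config import app_config
print(app_config.deployment_mode, app_config.cache_dir)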
src/core/exceptions.py ADDED
@@ -0,0 +1,43 @@
+ """
+ Custom exceptions for PodcastMCP
+ """
+
+
+ class PodcastMCPError(Exception):
+     """Base exception class for PodcastMCP"""
+     pass
+
+
+ class AppError(PodcastMCPError):
+     """Application error"""
+     pass
+
+
+ class ConfigError(PodcastMCPError):
+     """Configuration-related error"""
+     pass
+
+
+ class ValidationError(PodcastMCPError):
+     """Validation-related error"""
+     pass
+
+
+ class TranscriptionError(PodcastMCPError):
+     """Transcription-related error"""
+     pass
+
+
+ class DeploymentError(PodcastMCPError):
+     """Deployment-related error"""
+     pass
+
+
+ class FileNotFoundError(PodcastMCPError):
+     """File-not-found error (note: shadows the built-in FileNotFoundError)"""
+     pass
+
+
+ class EndpointError(PodcastMCPError):
+     """Endpoint-related error"""
+     pass
src/core/speaker_diarization.py ADDED
@@ -0,0 +1,126 @@
+ """
+ Speaker diarization implementation using pyannote.audio
+ """
+
+ import os
+ import torch
+ from typing import Optional, List, Dict, Any
+
+ from ..interfaces.speaker_detector import ISpeakerDetector
+ from ..utils.config import AudioProcessingConfig
+ from ..utils.errors import SpeakerDiarizationError, ModelLoadError
+
+
+ class PyannoteSpeikerDetector(ISpeakerDetector):
+     """Speaker diarization using pyannote.audio"""
+
+     def __init__(self, config: Optional[AudioProcessingConfig] = None):
+         self.config = config or AudioProcessingConfig()
+         self.device = self._setup_device()
+         self.pipeline = None
+         self.auth_token = os.environ.get(self.config.hf_token_env_var)
+
+         if not self.auth_token:
+             print("⚠️ No Hugging Face token found. Speaker diarization will be disabled.")
+
+     def _setup_device(self) -> torch.device:
+         """Setup and return the best available device"""
+         if torch.cuda.is_available():
+             return torch.device("cuda")
+         else:
+             return torch.device("cpu")
+
+     async def detect_speakers(
+         self,
+         audio_file_path: str,
+         num_speakers: Optional[int] = None,
+         min_speakers: int = 1,
+         max_speakers: int = 10
+     ) -> Dict[str, Any]:
+         """Detect speakers in audio file"""
+
+         if not self.auth_token:
+             raise SpeakerDiarizationError(
+                 "Speaker diarization requires Hugging Face token",
+                 audio_file=audio_file_path
+             )
+
+         try:
+             # Load pipeline if not already loaded
+             if self.pipeline is None:
+                 self.pipeline = self._load_pipeline()
+
+             # Perform diarization
+             diarization = self.pipeline(audio_file_path)
+
+             # Convert to our format
+             speakers = {}
+             segments = []
+
+             for turn, _, speaker in diarization.itertracks(yield_label=True):
+                 speaker_id = f"SPEAKER_{speaker.split('_')[-1].zfill(2)}"
+                 segments.append({
+                     "start": turn.start,
+                     "end": turn.end,
+                     "speaker": speaker_id
+                 })
+
+                 if speaker_id not in speakers:
+                     speakers[speaker_id] = {
+                         "id": speaker_id,
+                         "total_time": 0.0,
+                         "segments": []
+                     }
+
+                 speakers[speaker_id]["total_time"] += turn.end - turn.start
+                 speakers[speaker_id]["segments"].append({
+                     "start": turn.start,
+                     "end": turn.end
+                 })
+
+             return {
+                 "speaker_count": len(speakers),
+                 "speakers": speakers,
+                 "segments": segments,
+                 "audio_file": audio_file_path
+             }
+
+         except Exception as e:
+             raise SpeakerDiarizationError(
+                 f"Speaker detection failed: {str(e)}",
+                 audio_file=audio_file_path
+             )
+
+     def _load_pipeline(self):
+         """Load pyannote speaker diarization pipeline"""
+         try:
+             # Suppress warnings
+             import warnings
+             warnings.filterwarnings("ignore", category=UserWarning, module="pyannote")
+             warnings.filterwarnings("ignore", category=UserWarning, module="pytorch_lightning")
+             warnings.filterwarnings("ignore", category=FutureWarning, module="pytorch_lightning")
+
+             from pyannote.audio import Pipeline
+
+             print("📥 Loading speaker diarization pipeline...")
+             pipeline = Pipeline.from_pretrained(
+                 self.config.speaker_diarization_model,
+                 use_auth_token=self.auth_token
+             )
+             pipeline.to(self.device)
+
+             return pipeline
+
+         except Exception as e:
+             raise ModelLoadError(
+                 f"Failed to load speaker diarization pipeline: {str(e)}",
+                 model_name=self.config.speaker_diarization_model
+             )
+
+     def get_supported_models(self) -> List[str]:
+         """Get list of supported speaker diarization models"""
+         return [self.config.speaker_diarization_model]
+
+     def is_available(self) -> bool:
+         """Check if speaker diarization is available"""
+         return self.auth_token is not None
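A sketch of driving the detector above (requires a Hugging Face token in the environment variable named by `AudioProcessingConfig.hf_token_env_var`; the audio path is illustrative):

import asyncio
from src.core.speaker_diarization import PyannoteSpeikerDetector

detector = PyannoteSpeikerDetector()
if detector.is_available():
    # detect_speakers is async; run it to completion with asyncio
    result = asyncio.run(detector.detect_speakers("episode.mp3", min_speakers=1, max_speakers=5))
    print(f"Detected {result['speaker_count']} speakers")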
src/core/whisper_transcriber.py ADDED
@@ -0,0 +1,113 @@
+ """
+ Local Whisper transcriber implementation
+ """
+
+ import whisper
+ import torch
+ import pathlib
+ import time
+ from typing import Optional, List
+
+ from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
+ from ..utils.config import AudioProcessingConfig
+ from ..utils.errors import TranscriptionError, ModelLoadError
+
+
+ class WhisperTranscriber(ITranscriber):
+     """Local Whisper transcriber implementation"""
+
+     def __init__(self, config: Optional[AudioProcessingConfig] = None):
+         self.config = config or AudioProcessingConfig()
+         self.model_cache = {}
+         self.device = self._setup_device()
+
+     def _setup_device(self) -> str:
+         """Setup and return the best available device"""
+         if torch.cuda.is_available():
+             return "cuda"
+         else:
+             return "cpu"
+
+     async def transcribe(
+         self,
+         audio_file_path: str,
+         model_size: str = "turbo",
+         language: Optional[str] = None,
+         enable_speaker_diarization: bool = False
+     ) -> TranscriptionResult:
+         """Transcribe audio using local Whisper model"""
+
+         try:
+             # Validate audio file
+             audio_path = pathlib.Path(audio_file_path)
+             if not audio_path.exists():
+                 raise TranscriptionError(
+                     f"Audio file not found: {audio_file_path}",
+                     audio_file=audio_file_path
+                 )
+
+             # Load model
+             model = self._load_model(model_size)
+
+             # Transcribe
+             start_time = time.time()
+             result = model.transcribe(
+                 str(audio_path),
+                 language=language,
+                 verbose=False
+             )
+             processing_time = time.time() - start_time
+
+             # Convert to our format
+             segments = []
+             for seg in result.get("segments", []):
+                 segments.append(TranscriptionSegment(
+                     start=seg["start"],
+                     end=seg["end"],
+                     text=seg["text"].strip(),
+                     confidence=seg.get("avg_logprob")
+                 ))
+
+             return TranscriptionResult(
+                 text=result.get("text", "").strip(),
+                 segments=segments,
+                 language=result.get("language", "unknown"),
+                 model_used=model_size,
+                 audio_duration=result.get("duration", 0),
+                 processing_time=processing_time,
+                 speaker_diarization_enabled=enable_speaker_diarization,
+                 global_speaker_count=0,
+                 error_message=None
+             )
+
+         except Exception as e:
+             raise TranscriptionError(
+                 f"Whisper transcription failed: {str(e)}",
+                 model=model_size,
+                 audio_file=audio_file_path
+             )
+
+     def _load_model(self, model_size: str):
+         """Load Whisper model with caching"""
+         if model_size not in self.model_cache:
+             try:
+                 print(f"📥 Loading Whisper model: {model_size}")
+                 self.model_cache[model_size] = whisper.load_model(
+                     model_size,
+                     device=self.device
+                 )
+             except Exception as e:
+                 raise ModelLoadError(
+                     f"Failed to load model {model_size}: {str(e)}",
+                     model_name=model_size
+                 )
+
+         return self.model_cache[model_size]
+
+     def get_supported_models(self) -> List[str]:
+         """Get list of supported model sizes"""
+         return list(self.config.whisper_models.keys())
+
+     def get_supported_languages(self) -> List[str]:
+         """Get list of supported language codes"""
+         return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]
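A local-only usage sketch for the transcriber above (assumes the Whisper weights can be downloaded on first use; the audio path is illustrative):

import asyncio
from src.core.whisper_transcriber import WhisperTranscriber

transcriber = WhisperTranscriber()
# transcribe is async and caches the loaded model across calls
result = asyncio.run(transcriber.transcribe("episode.mp3", model_size="turbo", language="en"))
print(result.language, len(result.segments), "segments")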
src/deployment/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """
+ Deployment management for audio processing services
+ """
+
+ from .modal_deployer import ModalDeployer
+ from .endpoint_manager import EndpointManager
+
+ __all__ = ["ModalDeployer", "EndpointManager"]
src/deployment/deployment_manager.py ADDED
@@ -0,0 +1,153 @@
+ """
+ Simplified deployment manager
+ This replaces the complex deploy_endpoints.py with a cleaner interface
+ """
+
+ import argparse
+ import sys
+ from typing import Optional
+
+ from ..audio_processing.deployment import ModalDeployer, EndpointManager
+ from ..audio_processing.utils.config import AudioProcessingConfig
+ from ..audio_processing.utils.errors import DeploymentError
+
+
+ class DeploymentManager:
+     """Simplified deployment manager for audio processing services"""
+
+     def __init__(self):
+         self.config = AudioProcessingConfig()
+         self.modal_deployer = ModalDeployer(self.config)
+         self.endpoint_manager = EndpointManager()
+
+     def deploy(self) -> bool:
+         """Deploy transcription service"""
+         try:
+             print("🚀 Starting deployment process...")
+             endpoint_url = self.modal_deployer.deploy_transcription_service()
+
+             if endpoint_url:
+                 print(f"✅ Deployment successful!")
+                 print(f"🌐 Endpoint URL: {endpoint_url}")
+                 return True
+             else:
+                 print("❌ Deployment failed: Could not get endpoint URL")
+                 return False
+
+         except DeploymentError as e:
+             print(f"❌ Deployment failed: {e.message}")
+             if e.details:
+                 print(f"📋 Details: {e.details}")
+             return False
+         except Exception as e:
+             print(f"❌ Unexpected deployment error: {str(e)}")
+             return False
+
+     def status(self) -> bool:
+         """Check deployment status"""
+         print("🔍 Checking deployment status...")
+
+         endpoints = self.endpoint_manager.list_endpoints()
+         if not endpoints:
+             print("❌ No endpoints configured")
+             return False
+
+         print(f"📋 Configured endpoints:")
+         for name, url in endpoints.items():
+             print(f" • {name}: {url}")
+
+         # Check health
+         return self.modal_deployer.check_deployment_status()
+
+     def undeploy(self):
+         """Remove deployment configuration"""
+         print("🗑️ Removing deployment configuration...")
+         self.modal_deployer.undeploy_transcription_service()
+
+     def list_endpoints(self):
+         """List all configured endpoints"""
+         endpoints = self.endpoint_manager.list_endpoints()
+
+         if not endpoints:
+             print("📋 No endpoints configured")
+             return
+
+         print("📋 Configured endpoints:")
+         for name, url in endpoints.items():
+             health_status = "✅ Healthy" if self.endpoint_manager.check_endpoint_health(name) else "❌ Unhealthy"
+             print(f" • {name}: {url} ({health_status})")
+
+     def set_endpoint(self, name: str, url: str):
+         """Manually set an endpoint"""
+         self.endpoint_manager.set_endpoint(name, url)
+
+     def remove_endpoint(self, name: str):
+         """Remove an endpoint"""
+         self.endpoint_manager.remove_endpoint(name)
+
+
+ def main():
+     """Command line interface for deployment manager"""
+     parser = argparse.ArgumentParser(description="Audio Processing Deployment Manager")
+     subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+     # Deploy command
+     subparsers.add_parser("deploy", help="Deploy transcription service to Modal")
+
+     # Status command
+     subparsers.add_parser("status", help="Check deployment status")
+
+     # Undeploy command
+     subparsers.add_parser("undeploy", help="Remove deployment configuration")
+
+     # List endpoints command
+     subparsers.add_parser("list", help="List all configured endpoints")
+
+     # Set endpoint command
+     set_parser = subparsers.add_parser("set", help="Set endpoint URL manually")
+     set_parser.add_argument("name", help="Endpoint name")
+     set_parser.add_argument("url", help="Endpoint URL")
+
+     # Remove endpoint command
+     remove_parser = subparsers.add_parser("remove", help="Remove endpoint")
+     remove_parser.add_argument("name", help="Endpoint name")
+
+     args = parser.parse_args()
+
+     if not args.command:
+         parser.print_help()
+         return
+
+     manager = DeploymentManager()
+
+     try:
+         if args.command == "deploy":
+             success = manager.deploy()
+             sys.exit(0 if success else 1)
+
+         elif args.command == "status":
+             success = manager.status()
+             sys.exit(0 if success else 1)
+
+         elif args.command == "undeploy":
+             manager.undeploy()
+
+         elif args.command == "list":
+             manager.list_endpoints()
+
+         elif args.command == "set":
+             manager.set_endpoint(args.name, args.url)
+
+         elif args.command == "remove":
+             manager.remove_endpoint(args.name)
+
+     except KeyboardInterrupt:
+         print("\n⚠️ Operation cancelled by user")
+         sys.exit(1)
+     except Exception as e:
+         print(f"❌ Error: {str(e)}")
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
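Programmatic usage of the manager above is just the CLI subcommands without argparse; a sketch, assuming the package imports resolve from the repository root (the import path below is an assumption, not shown in this commit):

    from src.deployment.deployment_manager import DeploymentManager

    manager = DeploymentManager()
    if manager.deploy():          # runs `modal deploy` and stores the endpoint URL
        manager.list_endpoints()  # prints each endpoint with a health-check status
    else:
        manager.set_endpoint(     # fall back to wiring an endpoint by hand
            "transcribe_audio",
            "https://my-app--transcribe-audio-endpoint.modal.run",  # placeholder URL
        )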
src/deployment/endpoint_manager.py ADDED
@@ -0,0 +1,76 @@
+ """
+ Endpoint manager for handling Modal endpoints
+ """
+
+ import json
+ import os
+ from typing import Dict, Optional
+
+ from ..utils.errors import ConfigurationError
+
+
+ class EndpointManager:
+     """Manager for Modal endpoint configuration"""
+
+     def __init__(self, config_file: str = "endpoint_config.json"):
+         self.config_file = config_file
+         self._endpoints = self._load_endpoints()
+
+     def _load_endpoints(self) -> Dict[str, str]:
+         """Load endpoints from configuration file"""
+         if not os.path.exists(self.config_file):
+             return {}
+
+         try:
+             with open(self.config_file, 'r') as f:
+                 return json.load(f)
+         except Exception as e:
+             print(f"⚠️ Failed to load endpoint configuration: {e}")
+             return {}
+
+     def save_endpoints(self):
+         """Save endpoints to configuration file"""
+         try:
+             with open(self.config_file, 'w') as f:
+                 json.dump(self._endpoints, f, indent=2)
+             print(f"💾 Endpoint configuration saved to {self.config_file}")
+         except Exception as e:
+             raise ConfigurationError(f"Failed to save endpoint configuration: {e}")
+
+     def set_endpoint(self, name: str, url: str):
+         """Set endpoint URL"""
+         self._endpoints[name] = url
+         self.save_endpoints()
+         print(f"✅ Endpoint '{name}' set to: {url}")
+
+     def get_endpoint(self, name: str) -> Optional[str]:
+         """Get endpoint URL"""
+         return self._endpoints.get(name)
+
+     def remove_endpoint(self, name: str):
+         """Remove endpoint"""
+         if name in self._endpoints:
+             del self._endpoints[name]
+             self.save_endpoints()
+             print(f"🗑️ Endpoint '{name}' removed")
+         else:
+             print(f"⚠️ Endpoint '{name}' not found")
+
+     def list_endpoints(self) -> Dict[str, str]:
+         """List all endpoints"""
+         return self._endpoints.copy()
+
+     def check_endpoint_health(self, name: str) -> bool:
+         """Check if endpoint is healthy"""
+         url = self.get_endpoint(name)
+         if not url:
+             return False
+
+         try:
+             import requests
+             # Try a simple health check (adjust based on your endpoint)
+             health_url = url.replace("/transcribe", "/health")
+             response = requests.get(health_url, timeout=10)
+             return response.status_code == 200
+         except Exception:
+             return False
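The manager persists its endpoints as a flat name-to-URL JSON object in the configured file. A short usage sketch (the URL is a placeholder, and the health check simply issues a GET against the sibling `/health` route as implemented above; the import path is an assumption):

    from src.deployment.endpoint_manager import EndpointManager

    manager = EndpointManager("endpoint_config.json")
    manager.set_endpoint(
        "transcribe_audio",
        "https://my-app--transcribe-audio-endpoint.modal.run",  # placeholder URL
    )
    # endpoint_config.json now contains:
    # {
    #   "transcribe_audio": "https://my-app--transcribe-audio-endpoint.modal.run"
    # }
    print(manager.get_endpoint("transcribe_audio"))
    print(manager.check_endpoint_health("transcribe_audio"))  # GET .../health, 10 s timeout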
src/deployment/modal_deployer.py ADDED
@@ -0,0 +1,97 @@
+ """
+ Modal deployer for deploying transcription services
+ """
+
+ import subprocess
+ from typing import Optional
+
+ from ..utils.config import AudioProcessingConfig
+ from ..utils.errors import DeploymentError
+ from .endpoint_manager import EndpointManager
+
+
+ class ModalDeployer:
+     """Deployer for Modal transcription services"""
+
+     def __init__(self, config: Optional[AudioProcessingConfig] = None):
+         self.config = config or AudioProcessingConfig()
+         self.endpoint_manager = EndpointManager()
+
+     def deploy_transcription_service(self) -> Optional[str]:
+         """Deploy transcription service to Modal"""
+
+         print("🚀 Deploying transcription service to Modal...")
+
+         try:
+             # Deploy the Modal app
+             print("🚀 Running modal deploy command...")
+             result = subprocess.run(
+                 ["modal", "deploy", "modal_config.py"],
+                 capture_output=True,
+                 text=True
+             )
+
+             if result.returncode == 0:
+                 # Extract or construct endpoint URL
+                 endpoint_url = self._extract_endpoint_url(result.stdout)
+
+                 if endpoint_url:
+                     # Save endpoint configuration
+                     self.endpoint_manager.set_endpoint("transcribe_audio", endpoint_url)
+                     print(f"✅ Transcription service deployed: {endpoint_url}")
+                     return endpoint_url
+                 else:
+                     print("⚠️ Could not extract endpoint URL from deployment output")
+                     return None
+             else:
+                 raise DeploymentError(
+                     f"Modal deployment failed: {result.stderr}",
+                     service="transcription"
+                 )
+
+         except FileNotFoundError:
+             raise DeploymentError(
+                 "Modal CLI not found. Please install Modal: pip install modal",
+                 service="transcription"
+             )
+         except Exception as e:
+             raise DeploymentError(
+                 f"Failed to deploy transcription service: {str(e)}",
+                 service="transcription"
+             )
+
+     def _extract_endpoint_url(self, output: str) -> Optional[str]:
+         """Extract endpoint URL from deployment output"""
+
+         # Look for URL in output
+         for line in output.split('\n'):
+             if 'https://' in line and 'modal.run' in line:
+                 # Extract URL from line
+                 parts = line.split()
+                 for part in parts:
+                     if part.startswith('https://') and 'modal.run' in part:
+                         return part
+
+         # Fallback to constructed URL
+         return f"https://{self.config.modal_app_name}--transcribe-audio-endpoint.modal.run"
+
+     def check_deployment_status(self) -> bool:
+         """Check if transcription service is deployed and healthy"""
+
+         endpoint_url = self.endpoint_manager.get_endpoint("transcribe_audio")
+         if not endpoint_url:
+             print("❌ No transcription endpoint configured")
+             return False
+
+         if self.endpoint_manager.check_endpoint_health("transcribe_audio"):
+             print(f"✅ Transcription service is healthy: {endpoint_url}")
+             return True
+         else:
+             print(f"❌ Transcription service is not responding: {endpoint_url}")
+             return False
+
+     def undeploy_transcription_service(self):
+         """Remove transcription service endpoint"""
+         self.endpoint_manager.remove_endpoint("transcribe_audio")
+         print("🗑️ Transcription service endpoint removed from configuration")
+         print("💡 Note: The actual Modal deployment may still be active. Use 'modal app stop' to stop it.")
src/interfaces/__init__.py ADDED
@@ -0,0 +1,38 @@
+ """
+ Interfaces for audio processing components
+ """
+
+ from .transcriber import ITranscriber
+ from .speaker_detector import ISpeakerDetector
+ from .audio_splitter import IAudioSplitter
+ from .audio_processor import IAudioProcessor, AudioSegment
+ from .podcast_downloader import IPodcastDownloader, PodcastInfo, DownloadResult, PodcastPlatform
+ from .speaker_manager import (
+     ISpeakerEmbeddingManager,
+     ISpeakerIdentificationService,
+     SpeakerEmbedding,
+     SpeakerSegment
+ )
+
+ __all__ = [
+     # Core interfaces
+     "ITranscriber",
+     "ISpeakerDetector",
+     "IAudioSplitter",
+
+     # New service interfaces
+     "IAudioProcessor",
+     "IPodcastDownloader",
+     "ISpeakerEmbeddingManager",
+     "ISpeakerIdentificationService",
+
+     # Data classes
+     "AudioSegment",
+     "PodcastInfo",
+     "DownloadResult",
+     "SpeakerEmbedding",
+     "SpeakerSegment",
+
+     # Enums
+     "PodcastPlatform"
+ ]
src/interfaces/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (828 Bytes). View file
 
src/interfaces/__pycache__/audio_processor.cpython-310.pyc ADDED
Binary file (2.01 kB). View file
 
src/interfaces/__pycache__/audio_splitter.cpython-310.pyc ADDED
Binary file (1.84 kB). View file
 
src/interfaces/__pycache__/podcast_downloader.cpython-310.pyc ADDED
Binary file (2.63 kB). View file
 
src/interfaces/__pycache__/speaker_detector.cpython-310.pyc ADDED
Binary file (2.55 kB). View file
 
src/interfaces/__pycache__/speaker_manager.cpython-310.pyc ADDED
Binary file (4.26 kB). View file
 
src/interfaces/__pycache__/transcriber.cpython-310.pyc ADDED
Binary file (2.55 kB). View file
 
src/interfaces/audio_processor.py ADDED
@@ -0,0 +1,53 @@
+ """
+ Audio processing interface definitions
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import Dict, Any, List, Tuple, Iterator, Optional
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class AudioSegment:
+     """Audio segment representation"""
+     start: float
+     end: float
+     file_path: str
+     duration: float
+
+
+ class IAudioProcessor(ABC):
+     """Interface for audio processing operations"""
+
+     @abstractmethod
+     async def split_audio_by_silence(
+         self,
+         audio_path: str,
+         min_segment_length: float = 30.0,
+         min_silence_length: float = 1.0
+     ) -> List[AudioSegment]:
+         """Split audio file by silence detection"""
+         pass
+
+     @abstractmethod
+     async def process_audio_segment(
+         self,
+         segment: AudioSegment,
+         model_name: str = "turbo",
+         language: Optional[str] = None,
+         enable_speaker_diarization: bool = False
+     ) -> Dict[str, Any]:
+         """Process a single audio segment"""
+         pass
+
+     @abstractmethod
+     async def process_complete_audio(
+         self,
+         audio_path: str,
+         model_name: str = "turbo",
+         language: Optional[str] = None,
+         enable_speaker_diarization: bool = False,
+         min_segment_length: float = 30.0
+     ) -> Dict[str, Any]:
+         """Process complete audio file"""
+         pass
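Because every method on `IAudioProcessor` is a coroutine, callers drive it with `asyncio`; a minimal driver sketch (the concrete processor instance and the input path are placeholders supplied by whatever adapter implements the interface):

    import asyncio

    async def transcribe_file(processor: IAudioProcessor, path: str):
        # End-to-end call through the async interface
        return await processor.process_complete_audio(
            path,
            model_name="turbo",
            enable_speaker_diarization=False,
        )

    # result = asyncio.run(transcribe_file(some_processor, "episode.mp3"))  # hypothetical inputs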
src/interfaces/audio_splitter.py ADDED
@@ -0,0 +1,48 @@
+ """
+ Audio splitter interface definition
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import Iterator, Tuple
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class AudioSegment:
+     """Audio segment data class"""
+     start: float
+     end: float
+     duration: float
+
+     def __post_init__(self):
+         if self.duration <= 0:
+             self.duration = self.end - self.start
+
+
+ class IAudioSplitter(ABC):
+     """Interface for audio splitting"""
+
+     @abstractmethod
+     def split_audio(
+         self,
+         audio_path: str,
+         min_segment_length: float = 30.0,
+         min_silence_length: float = 1.0
+     ) -> Iterator[AudioSegment]:
+         """
+         Split audio into segments
+
+         Args:
+             audio_path: Path to audio file
+             min_segment_length: Minimum segment length in seconds
+             min_silence_length: Minimum silence length for splitting
+
+         Yields:
+             AudioSegment objects
+         """
+         pass
+
+     @abstractmethod
+     def get_audio_duration(self, audio_path: str) -> float:
+         """Get total duration of audio file"""
+         pass
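One way to satisfy this interface is silence detection with pydub; the sketch below is illustrative only (it assumes pydub is installed, and it drops rather than merges spans shorter than `min_segment_length`, unlike a production splitter):

    from pydub import AudioSegment as PydubAudio
    from pydub.silence import detect_nonsilent

    class PydubAudioSplitter(IAudioSplitter):
        def split_audio(self, audio_path, min_segment_length=30.0, min_silence_length=1.0):
            audio = PydubAudio.from_file(audio_path)
            spans = detect_nonsilent(
                audio,
                min_silence_len=int(min_silence_length * 1000),  # pydub works in milliseconds
                silence_thresh=audio.dBFS - 16,                  # loudness threshold relative to the file
            )
            for start_ms, end_ms in spans:
                start, end = start_ms / 1000.0, end_ms / 1000.0
                if end - start >= min_segment_length:
                    yield AudioSegment(start=start, end=end, duration=end - start)

        def get_audio_duration(self, audio_path):
            return len(PydubAudio.from_file(audio_path)) / 1000.0  # len() is in milliseconds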
src/interfaces/podcast_downloader.py ADDED
@@ -0,0 +1,66 @@
+ """
+ Podcast downloading interface definitions
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import Dict, Any, Optional, Tuple
+ from dataclasses import dataclass
+ from enum import Enum
+
+
+ class PodcastPlatform(Enum):
+     """Podcast platform enumeration"""
+     APPLE = "apple"
+     XIAOYUZHOU = "xyz"
+     SPOTIFY = "spotify"
+     GENERIC = "generic"
+
+
+ @dataclass
+ class PodcastInfo:
+     """Podcast episode information"""
+     title: str
+     audio_url: str
+     episode_id: str
+     platform: PodcastPlatform
+     duration: Optional[float] = None
+     description: Optional[str] = None
+
+
+ @dataclass
+ class DownloadResult:
+     """Download operation result"""
+     success: bool
+     file_path: Optional[str]
+     podcast_info: Optional[PodcastInfo]
+     error_message: Optional[str] = None
+
+
+ class IPodcastDownloader(ABC):
+     """Interface for podcast downloading operations"""
+
+     @abstractmethod
+     async def extract_podcast_info(self, url: str) -> PodcastInfo:
+         """Extract podcast information from URL"""
+         pass
+
+     @abstractmethod
+     async def download_podcast(
+         self,
+         url: str,
+         output_folder: str = "downloads",
+         convert_to_mp3: bool = False,
+         keep_original: bool = False
+     ) -> DownloadResult:
+         """Download podcast from URL"""
+         pass
+
+     @abstractmethod
+     def get_supported_platforms(self) -> list[PodcastPlatform]:
+         """Get list of supported platforms"""
+         pass
+
+     @abstractmethod
+     def can_handle_url(self, url: str) -> bool:
+         """Check if this downloader can handle the given URL"""
+         pass
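`can_handle_url` and `get_supported_platforms` let a factory route a URL to the right downloader. A toy routing sketch (the domain list is a guess for illustration, not taken from the concrete downloaders in this commit):

    from urllib.parse import urlparse

    _PLATFORM_BY_HOST = {
        "podcasts.apple.com": PodcastPlatform.APPLE,
        "www.xiaoyuzhoufm.com": PodcastPlatform.XIAOYUZHOU,
        "open.spotify.com": PodcastPlatform.SPOTIFY,
    }

    def guess_platform(url: str) -> PodcastPlatform:
        # Fall back to GENERIC when the host is not recognised
        host = urlparse(url).netloc.lower()
        return _PLATFORM_BY_HOST.get(host, PodcastPlatform.GENERIC)

    print(guess_platform("https://podcasts.apple.com/us/podcast/example/id123"))  # PodcastPlatform.APPLE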
src/interfaces/speaker_detector.py ADDED
@@ -0,0 +1,71 @@
+ """
+ Speaker detector interface definition
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import Dict, List, Optional
+ from dataclasses import dataclass
+ import numpy as np
+
+
+ @dataclass
+ class SpeakerSegment:
+     """Speaker segment data class"""
+     start: float
+     end: float
+     speaker_id: str
+     confidence: Optional[float] = None
+
+
+ @dataclass
+ class SpeakerProfile:
+     """Speaker profile data class"""
+     speaker_id: str
+     embedding: np.ndarray
+     segments: List[SpeakerSegment]
+     total_duration: float
+
+
+ class ISpeakerDetector(ABC):
+     """Interface for speaker detection and diarization"""
+
+     @abstractmethod
+     async def detect_speakers(
+         self,
+         audio_file_path: str,
+         audio_segments: Optional[List] = None
+     ) -> Dict[str, SpeakerProfile]:
+         """
+         Detect and identify speakers in audio
+
+         Args:
+             audio_file_path: Path to audio file
+             audio_segments: Optional pre-segmented audio
+
+         Returns:
+             Dictionary mapping speaker IDs to SpeakerProfile objects
+         """
+         pass
+
+     @abstractmethod
+     def map_to_global_speakers(
+         self,
+         local_speakers: Dict[str, SpeakerProfile],
+         source_file: str
+     ) -> Dict[str, str]:
+         """
+         Map local speakers to global speaker identities
+
+         Args:
+             local_speakers: Local speaker profiles
+             source_file: Source audio file path
+
+         Returns:
+             Mapping from local speaker ID to global speaker ID
+         """
+         pass
+
+     @abstractmethod
+     def get_speaker_summary(self) -> Dict:
+         """Get summary of all detected speakers"""
+         pass
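For reference, the data classes above combine roughly as follows; the embedding size and speaker IDs are placeholders, and a `map_to_global_speakers` implementation would return a plain local-to-global ID mapping:

    import numpy as np

    seg = SpeakerSegment(start=0.0, end=12.5, speaker_id="SPEAKER_00", confidence=0.91)
    profile = SpeakerProfile(
        speaker_id="SPEAKER_00",
        embedding=np.zeros(256),            # placeholder embedding vector
        segments=[seg],
        total_duration=seg.end - seg.start,
    )
    # detect_speakers() -> {"SPEAKER_00": profile}
    # map_to_global_speakers() -> e.g. {"SPEAKER_00": "global_speaker_3"}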