Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- src/__init__.py +13 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/__pycache__/app.cpython-310.pyc +0 -0
- src/adapters/__init__.py +13 -0
- src/adapters/__pycache__/__init__.cpython-310.pyc +0 -0
- src/adapters/__pycache__/local_adapter.cpython-310.pyc +0 -0
- src/adapters/__pycache__/modal_adapter.cpython-310.pyc +0 -0
- src/adapters/__pycache__/transcription_adapter_factory.cpython-310.pyc +0 -0
- src/adapters/local_adapter.py +93 -0
- src/adapters/modal_adapter.py +126 -0
- src/adapters/transcription_adapter_factory.py +77 -0
- src/api/__init__.py +5 -0
- src/api/__pycache__/__init__.cpython-310.pyc +0 -0
- src/api/__pycache__/transcription_api.cpython-310.pyc +0 -0
- src/api/transcription_api.py +112 -0
- src/app.py +169 -0
- src/config/__init__.py +5 -0
- src/config/__pycache__/__init__.cpython-310.pyc +0 -0
- src/config/__pycache__/config.cpython-310.pyc +0 -0
- src/config/__pycache__/modal_config.cpython-310.pyc +0 -0
- src/config/config.py +81 -0
- src/config/modal_config.py +210 -0
- src/core/__init__.py +29 -0
- src/core/__pycache__/__init__.cpython-310.pyc +0 -0
- src/core/__pycache__/audio_splitter.cpython-310.pyc +0 -0
- src/core/__pycache__/config.cpython-310.pyc +0 -0
- src/core/__pycache__/exceptions.cpython-310.pyc +0 -0
- src/core/__pycache__/speaker_diarization.cpython-310.pyc +0 -0
- src/core/__pycache__/whisper_transcriber.cpython-310.pyc +0 -0
- src/core/audio_splitter.py +90 -0
- src/core/config.py +150 -0
- src/core/exceptions.py +43 -0
- src/core/speaker_diarization.py +126 -0
- src/core/whisper_transcriber.py +113 -0
- src/deployment/__init__.py +8 -0
- src/deployment/deployment_manager.py +153 -0
- src/deployment/endpoint_manager.py +76 -0
- src/deployment/modal_deployer.py +97 -0
- src/interfaces/__init__.py +38 -0
- src/interfaces/__pycache__/__init__.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/audio_processor.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/audio_splitter.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/podcast_downloader.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/speaker_detector.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/speaker_manager.cpython-310.pyc +0 -0
- src/interfaces/__pycache__/transcriber.cpython-310.pyc +0 -0
- src/interfaces/audio_processor.py +53 -0
- src/interfaces/audio_splitter.py +48 -0
- src/interfaces/podcast_downloader.py +66 -0
- src/interfaces/speaker_detector.py +71 -0
src/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""
PodcastMcpGradio - Podcast Processing and Analysis Framework

A comprehensive framework for podcast downloading, transcription, and analysis
with MCP (Model Context Protocol) integration and Gradio UI.
"""

__version__ = "2.0.0"
__author__ = "PodcastMcpGradio Team"
__description__ = "Podcast Processing and Analysis Framework"

# Core modules will be imported as needed
__all__ = []
src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (507 Bytes).
src/__pycache__/app.cpython-310.pyc
ADDED
Binary file (5.02 kB).
src/adapters/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""
Adapters for different transcription backends
"""

from .transcription_adapter_factory import TranscriptionAdapterFactory
from .local_adapter import LocalTranscriptionAdapter
from .modal_adapter import ModalTranscriptionAdapter

__all__ = [
    "TranscriptionAdapterFactory",
    "LocalTranscriptionAdapter",
    "ModalTranscriptionAdapter"
]
src/adapters/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (460 Bytes).
src/adapters/__pycache__/local_adapter.cpython-310.pyc
ADDED
Binary file (3.21 kB).
src/adapters/__pycache__/modal_adapter.cpython-310.pyc
ADDED
Binary file (3.77 kB).
src/adapters/__pycache__/transcription_adapter_factory.cpython-310.pyc
ADDED
Binary file (2.44 kB).
src/adapters/local_adapter.py
ADDED
@@ -0,0 +1,93 @@
"""
Local transcription adapter for direct processing
"""

import asyncio
from typing import List, Optional

from ..interfaces.transcriber import ITranscriber, TranscriptionResult
from ..utils.config import AudioProcessingConfig
from ..utils.errors import TranscriptionError


class LocalTranscriptionAdapter(ITranscriber):
    """Adapter for local transcription processing"""

    def __init__(self, config: Optional[AudioProcessingConfig] = None):
        self.config = config or AudioProcessingConfig()

    async def transcribe(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False
    ) -> TranscriptionResult:
        """Transcribe audio using local processing"""

        try:
            # Use the new AudioProcessingService instead of old methods
            from ..services.audio_processing_service import AudioProcessingService
            from ..models.services import AudioProcessingRequest

            print(f"🔄 Starting local transcription for: {audio_file_path}")
            print(f"🚀 Running transcription with {model_size} model...")

            # Create service and request
            audio_service = AudioProcessingService()
            request = AudioProcessingRequest(
                audio_file_path=audio_file_path,
                model_size=model_size,
                language=language,
                output_format="json",
                enable_speaker_diarization=enable_speaker_diarization
            )

            # Process transcription
            result = audio_service.transcribe_full_audio(request)

            # Convert service result to adapter format
            return self._convert_service_result(result)

        except Exception as e:
            raise TranscriptionError(
                f"Local transcription failed: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            )

    def get_supported_models(self) -> List[str]:
        """Get list of supported model sizes"""
        return list(self.config.whisper_models.keys())

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        # This would normally come from Whisper's supported languages
        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]

    def _convert_service_result(self, service_result) -> TranscriptionResult:
        """Convert service result format to TranscriptionResult"""
        from ..interfaces.transcriber import TranscriptionSegment

        # Extract segments from service result if available
        segments = []
        if hasattr(service_result, 'segments') and service_result.segments:
            for seg in service_result.segments:
                segments.append(TranscriptionSegment(
                    start=getattr(seg, 'start', 0),
                    end=getattr(seg, 'end', 0),
                    text=getattr(seg, 'text', ''),
                    speaker=getattr(seg, 'speaker', None)
                ))

        return TranscriptionResult(
            text=getattr(service_result, 'text', ''),
            segments=segments,
            language=getattr(service_result, 'language_detected', 'unknown'),
            model_used=getattr(service_result, 'model_used', 'unknown'),
            audio_duration=getattr(service_result, 'audio_duration', 0),
            processing_time=getattr(service_result, 'processing_time', 0),
            speaker_diarization_enabled=getattr(service_result, 'speaker_diarization_enabled', False),
            global_speaker_count=getattr(service_result, 'global_speaker_count', 0),
            error_message=getattr(service_result, 'error_message', None)
        )
src/adapters/modal_adapter.py
ADDED
@@ -0,0 +1,126 @@
"""
Modal transcription adapter for remote processing
"""

import requests
import base64
import pathlib
from typing import List, Optional

from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
from ..utils.config import AudioProcessingConfig
from ..utils.errors import TranscriptionError


class ModalTranscriptionAdapter(ITranscriber):
    """Adapter for Modal remote transcription processing"""

    def __init__(self, config: Optional[AudioProcessingConfig] = None, endpoint_url: Optional[str] = None):
        self.config = config or AudioProcessingConfig()
        self.endpoint_url = endpoint_url

    async def transcribe(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False
    ) -> TranscriptionResult:
        """Transcribe audio using Modal endpoint"""

        if not self.endpoint_url:
            raise TranscriptionError(
                "Modal endpoint URL not configured",
                model=model_size,
                audio_file=audio_file_path
            )

        try:
            # Read and encode audio file
            audio_path = pathlib.Path(audio_file_path)
            if not audio_path.exists():
                raise TranscriptionError(
                    f"Audio file not found: {audio_file_path}",
                    audio_file=audio_file_path
                )

            with open(audio_path, 'rb') as f:
                audio_data = f.read()

            audio_base64 = base64.b64encode(audio_data).decode('utf-8')

            # Prepare request data
            request_data = {
                "audio_file_data": audio_base64,
                "audio_file_name": audio_path.name,
                "model_size": model_size,
                "language": language,
                "output_format": "json",
                "enable_speaker_diarization": enable_speaker_diarization
            }

            print("🔄 Sending transcription request to Modal endpoint")
            print(f"📁 File: {audio_file_path} ({len(audio_data) / (1024*1024):.2f} MB)")
            print(f"🔧 Model: {model_size}, Speaker diarization: {enable_speaker_diarization}")

            # Make request to Modal endpoint
            response = requests.post(
                self.endpoint_url,
                json=request_data,
                timeout=1800  # 30 minutes timeout
            )

            response.raise_for_status()
            result = response.json()

            print("✅ Modal transcription completed")

            # Convert result to TranscriptionResult format
            return self._convert_modal_result(result)

        except requests.exceptions.RequestException as e:
            raise TranscriptionError(
                f"Failed to call Modal endpoint: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            )
        except Exception as e:
            raise TranscriptionError(
                f"Modal transcription failed: {str(e)}",
                model=model_size,
                audio_file=audio_file_path
            )

    def get_supported_models(self) -> List[str]:
        """Get list of supported model sizes"""
        return list(self.config.whisper_models.keys())

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]

    def _convert_modal_result(self, modal_result: dict) -> TranscriptionResult:
        """Convert Modal result format to TranscriptionResult"""

        # Extract segments if available
        segments = []
        if "segments" in modal_result:
            for seg in modal_result["segments"]:
                segments.append(TranscriptionSegment(
                    start=seg.get("start", 0),
                    end=seg.get("end", 0),
                    text=seg.get("text", ""),
                    speaker=seg.get("speaker")
                ))

        return TranscriptionResult(
            text=modal_result.get("text", ""),
            segments=segments,
            language=modal_result.get("language_detected", "unknown"),
            model_used=modal_result.get("model_used", "unknown"),
            audio_duration=modal_result.get("audio_duration", 0),
            processing_time=modal_result.get("processing_time", 0),
            speaker_diarization_enabled=modal_result.get("speaker_diarization_enabled", False),
            global_speaker_count=modal_result.get("global_speaker_count", 0),
            error_message=modal_result.get("error_message")
        )
src/adapters/transcription_adapter_factory.py
ADDED
@@ -0,0 +1,77 @@
"""
Factory for creating transcription adapters
"""

import os
from typing import Optional

from ..interfaces.transcriber import ITranscriber
from ..utils.config import AudioProcessingConfig
from ..utils.errors import ConfigurationError
from .local_adapter import LocalTranscriptionAdapter
from .modal_adapter import ModalTranscriptionAdapter


class TranscriptionAdapterFactory:
    """Factory for creating appropriate transcription adapters"""

    @staticmethod
    def create_adapter(
        deployment_mode: str = "auto",
        config: Optional[AudioProcessingConfig] = None,
        endpoint_url: Optional[str] = None
    ) -> ITranscriber:
        """
        Create transcription adapter based on deployment mode

        Args:
            deployment_mode: "local", "modal", or "auto"
            config: Configuration object
            endpoint_url: Modal endpoint URL (for modal/auto mode)

        Returns:
            ITranscriber: Appropriate transcription adapter
        """

        config = config or AudioProcessingConfig()

        # Auto mode: decide based on environment and endpoint availability
        if deployment_mode == "auto":
            if endpoint_url:
                print(f"🌐 Auto mode: Using Modal adapter with endpoint {endpoint_url}")
                return ModalTranscriptionAdapter(config=config, endpoint_url=endpoint_url)
            else:
                print("🏠 Auto mode: Using Local adapter (no endpoint configured)")
                return LocalTranscriptionAdapter(config=config)

        # Explicit local mode
        elif deployment_mode == "local":
            print("🏠 Using Local transcription adapter")
            return LocalTranscriptionAdapter(config=config)

        # Explicit modal mode
        elif deployment_mode == "modal":
            if not endpoint_url:
                raise ConfigurationError(
                    "Modal endpoint URL is required for modal mode",
                    config_key="endpoint_url"
                )
            print(f"🌐 Using Modal transcription adapter with endpoint {endpoint_url}")
            return ModalTranscriptionAdapter(config=config, endpoint_url=endpoint_url)

        else:
            raise ConfigurationError(
                f"Unsupported deployment mode: {deployment_mode}. Use 'local', 'modal', or 'auto'",
                config_key="deployment_mode"
            )

    @staticmethod
    def _detect_deployment_mode() -> str:
        """Auto-detect deployment mode based on environment"""
        # Check if running in Modal environment
        if os.environ.get("MODAL_TASK_ID"):
            return "local"  # We're inside Modal, use local processing
        else:
            return "modal"  # We're outside Modal, use remote endpoint
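A minimal usage sketch of the factory above (not part of the diff; it assumes the package is importable as src.adapters, and the endpoint URL is a hypothetical placeholder):

# Hypothetical usage of TranscriptionAdapterFactory; the endpoint URL is a
# placeholder, not a real deployment.
from src.adapters import TranscriptionAdapterFactory

transcriber = TranscriptionAdapterFactory.create_adapter(
    deployment_mode="auto",
    endpoint_url="https://example--transcribe-audio-chunk-endpoint.modal.run",
)
print(transcriber.get_supported_models())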
src/api/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""
API Module - External interfaces and endpoints
"""

__all__ = []
src/api/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (223 Bytes).
src/api/__pycache__/transcription_api.cpython-310.pyc
ADDED
Binary file (3.27 kB).
src/api/transcription_api.py
ADDED
@@ -0,0 +1,112 @@
"""
Transcription API module
"""

import os
from typing import Optional, Dict, Any

from ..adapters import TranscriptionAdapterFactory
from ..services import TranscriptionService
from ..core import FFmpegAudioSplitter
from ..utils import AudioProcessingConfig, AudioProcessingError


class TranscriptionAPI:
    """High-level API for transcription operations"""

    def __init__(self, config: Optional[AudioProcessingConfig] = None):
        self.config = config or AudioProcessingConfig()
        self.transcription_service = None
        self._initialize_service()

    def _initialize_service(self):
        """Initialize transcription service with appropriate adapter"""
        try:
            # Get endpoint URL from config file if available
            endpoint_url = self._get_endpoint_url()

            # Create appropriate adapter
            transcriber = TranscriptionAdapterFactory.create_adapter(
                deployment_mode="auto",
                config=self.config,
                endpoint_url=endpoint_url
            )

            # Create audio splitter
            audio_splitter = FFmpegAudioSplitter()

            # Create transcription service
            self.transcription_service = TranscriptionService(
                transcriber=transcriber,
                audio_splitter=audio_splitter,
                speaker_detector=None,  # TODO: Add speaker detector when implemented
                config=self.config
            )

        except Exception as e:
            print(f"⚠️ Failed to initialize transcription service: {e}")
            raise AudioProcessingError(f"Service initialization failed: {e}")

    def _get_endpoint_url(self) -> Optional[str]:
        """Get Modal endpoint URL from configuration"""
        try:
            import json
            config_file = "endpoint_config.json"
            if os.path.exists(config_file):
                with open(config_file, 'r') as f:
                    config = json.load(f)
                return config.get("transcribe_audio")
        except Exception:
            pass
        return None

    async def transcribe_audio_file(
        self,
        audio_file_path: str,
        model_size: str = "turbo",
        language: Optional[str] = None,
        output_format: str = "srt",
        enable_speaker_diarization: bool = False
    ) -> Dict[str, Any]:
        """Transcribe audio file using the configured service"""

        if not self.transcription_service:
            raise AudioProcessingError("Transcription service not initialized")

        return await self.transcription_service.transcribe_audio_file(
            audio_file_path=audio_file_path,
            model_size=model_size,
            language=language,
            output_format=output_format,
            enable_speaker_diarization=enable_speaker_diarization
        )


# Create global API instance
_api_instance = None


def get_transcription_api() -> TranscriptionAPI:
    """Get global transcription API instance"""
    global _api_instance
    if _api_instance is None:
        _api_instance = TranscriptionAPI()
    return _api_instance


async def transcribe_audio_adaptive_sync(
    audio_file_path: str,
    model_size: str = "turbo",
    language: Optional[str] = None,
    output_format: str = "srt",
    enable_speaker_diarization: bool = False
) -> Dict[str, Any]:
    """
    Adaptive transcription function that routes to the appropriate backend
    """
    api = get_transcription_api()
    return await api.transcribe_audio_file(
        audio_file_path=audio_file_path,
        model_size=model_size,
        language=language,
        output_format=output_format,
        enable_speaker_diarization=enable_speaker_diarization
    )
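A minimal call sketch for the adaptive helper above (not part of the diff; the audio path is a placeholder, and the coroutine is driven with asyncio.run):

# Hypothetical call pattern for transcribe_audio_adaptive_sync; the audio
# path is a placeholder.
import asyncio
from src.api.transcription_api import transcribe_audio_adaptive_sync

result = asyncio.run(transcribe_audio_adaptive_sync(
    audio_file_path="episode.mp3",
    model_size="turbo",
    output_format="srt",
))
print(result)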
src/app.py
ADDED
@@ -0,0 +1,169 @@
# FastAPI + Gradio + FastMCP MCP server main entry point

import modal
from contextlib import asynccontextmanager
from fastapi import FastAPI
from gradio.routes import mount_gradio_app
import os
from dotenv import load_dotenv
import uvicorn
from mcp.server.fastmcp import FastMCP

# Import modules
from .tools import mcp_tools  # Import the module, not get_mcp_server function
from .ui.gradio_ui import create_gradio_interface
from .config.config import is_modal_mode, is_local_mode

# Always import modal config since this module might be imported in modal context
try:
    from .config.modal_config import app, image, volume, cache_dir, secrets
    _modal_available = True
except ImportError:
    _modal_available = False

# ==================== Application Creation Function ====================

def create_app():
    """Create and return the complete Gradio + MCP application"""

    print("🚀 Starting Gradio + FastMCP server")

    # Create FastMCP server with new tools
    mcp = FastMCP("Podcast MCP")

    # Register tools using the new service architecture
    @mcp.tool(description="Transcribe audio files to text using Whisper model with speaker diarization support")
    async def transcribe_audio_file_tool(
        audio_file_path: str,
        model_size: str = "turbo",
        language: str = None,
        output_format: str = "srt",
        enable_speaker_diarization: bool = False
    ):
        return await mcp_tools.transcribe_audio_file(
            audio_file_path, model_size, language, output_format, enable_speaker_diarization
        )

    @mcp.tool(description="Download Apple Podcast audio files")
    async def download_apple_podcast_tool(url: str):
        return await mcp_tools.download_apple_podcast(url)

    @mcp.tool(description="Download XiaoYuZhou podcast audio files")
    async def download_xyz_podcast_tool(url: str):
        return await mcp_tools.download_xyz_podcast(url)

    @mcp.tool(description="Scan directory for MP3 audio files")
    async def get_mp3_files_tool(directory: str):
        return await mcp_tools.get_mp3_files(directory)

    @mcp.tool(description="Get basic file information")
    async def get_file_info_tool(file_path: str):
        return await mcp_tools.get_file_info(file_path)

    @mcp.tool(description="Read text file content in segments")
    async def read_text_file_segments_tool(
        file_path: str,
        chunk_size: int = 65536,
        start_position: int = 0
    ):
        return await mcp_tools.read_text_file_segments(file_path, chunk_size, start_position)

    # Create FastAPI wrapper
    fastapi_wrapper = FastAPI(
        title="Modal AudioTranscriber MCP",
        description="Gradio UI + FastMCP Tool + Modal Integration AudioTranscriber MCP",
        version="1.0.0",
        lifespan=lambda app: mcp.session_manager.run()
    )

    # Get FastMCP's streamable HTTP app
    mcp_app = mcp.streamable_http_app()

    # Mount FastMCP application to /api path
    fastapi_wrapper.mount("/api", mcp_app)

    # Create Gradio interface
    ui_app = create_gradio_interface()

    # Use Gradio's standard mounting approach
    final_app = mount_gradio_app(
        app=fastapi_wrapper,
        blocks=ui_app,
        path="/",
        app_kwargs={
            "docs_url": "/docs",
            "redoc_url": "/redoc",
        }
    )

    print("✅ Server startup completed")
    print("🎨 Gradio UI: /")
    print("🔧 MCP Streamable HTTP: /api/mcp")
    print(f"📝 Server name: {mcp.name}")

    return final_app

# ==================== Modal Deployment Configuration ====================

# Create a separate Modal app for the Gradio interface
if _modal_available:
    gradio_mcp_app = modal.App(name="gradio-mcp-ui")

    @gradio_mcp_app.function(
        image=image,
        cpu=2,  # Adequate CPU for UI operations
        memory=4096,  # 4GB memory for stable UI performance
        max_containers=5,  # Reduced to control resource usage
        min_containers=1,  # Keep minimum containers for faster response
        scaledown_window=600,  # 10 minutes before scaling down
        timeout=1800,  # 30 minutes timeout to prevent preemption
        volumes={cache_dir: volume},
        secrets=secrets,
    )
    @modal.concurrent(max_inputs=100)
    @modal.asgi_app()
    def app_entry():
        """Modal deployment function - create and return the complete Gradio + MCP application"""
        return create_app()

# ==================== Main Entry Point ====================

def main():
    """Main entry point for all deployment modes"""

    if is_modal_mode():
        print("☁️ Modal mode: Use 'modal deploy src.app::gradio_mcp_app'")
        return None
    else:
        print("🏠 Starting in local mode")
        print("💡 GPU functions will be routed to Modal endpoints")

        app = create_app()
        return app

def run_local():
    """Run local server with uvicorn (for direct execution)"""
    app = main()
    if app:
        uvicorn.run(
            app,
            host="0.0.0.0",
            port=8000,
            reload=False
        )

# ==================== Hugging Face Spaces Support ====================

# For Hugging Face Spaces, directly create the app
def get_app():
    """Get app instance for HF Spaces"""
    if "DEPLOYMENT_MODE" not in os.environ:
        os.environ["DEPLOYMENT_MODE"] = "local"
    return main()

# Create app for HF Spaces when imported
if __name__ != "__main__":
    app = get_app()

if __name__ == "__main__":
    run_local()
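A minimal launch sketch mirroring run_local() above (not part of the diff; it assumes the package is importable as src.app, and sets DEPLOYMENT_MODE before import so GPU work stays routed to Modal endpoints):

# Hypothetical local launch; equivalent to calling run_local() directly.
import os
os.environ["DEPLOYMENT_MODE"] = "local"  # must be set before importing src.app

import uvicorn
from src.app import main

application = main()
if application:
    uvicorn.run(application, host="0.0.0.0", port=8000)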
src/config/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""
Config Module - Configuration management
"""

__all__ = []
src/config/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (220 Bytes).
src/config/__pycache__/config.cpython-310.pyc
ADDED
Binary file (2.78 kB).
src/config/__pycache__/modal_config.cpython-310.pyc
ADDED
Binary file (4.98 kB).
src/config/config.py
ADDED
@@ -0,0 +1,81 @@
"""
Deployment configuration for Gradio + MCP Server
Supports two deployment modes:
1. Local mode: Gradio runs locally, GPU functions call Modal endpoints
2. Modal mode: Gradio runs on Modal, GPU functions run locally on Modal
"""

import os
from enum import Enum
from typing import Optional

class DeploymentMode(Enum):
    LOCAL = "local"  # Local Gradio + Remote GPU (Modal endpoints)
    MODAL = "modal"  # Modal Gradio + Local GPU (Modal functions)

# Get deployment mode from environment variable
DEPLOYMENT_MODE = DeploymentMode(os.getenv("DEPLOYMENT_MODE", "local"))

# Modal endpoints configuration
MODAL_APP_NAME = "gradio-mcp-server"

# Endpoint URLs (will be set when deployed)
ENDPOINTS = {
    "transcribe_audio": None,  # Will be filled with actual endpoint URL
}

def get_deployment_mode() -> DeploymentMode:
    """Get current deployment mode"""
    return DEPLOYMENT_MODE

def is_local_mode() -> bool:
    """Check if running in local mode"""
    return DEPLOYMENT_MODE == DeploymentMode.LOCAL

def is_modal_mode() -> bool:
    """Check if running in modal mode"""
    return DEPLOYMENT_MODE == DeploymentMode.MODAL

def set_endpoint_url(endpoint_name: str, url: str):
    """Set endpoint URL for local mode"""
    global ENDPOINTS
    ENDPOINTS[endpoint_name] = url

def get_endpoint_url(endpoint_name: str) -> Optional[str]:
    """Get endpoint URL for local mode"""
    return ENDPOINTS.get(endpoint_name)

def get_transcribe_endpoint_url() -> Optional[str]:
    """Get transcription endpoint URL"""
    return get_endpoint_url("transcribe_audio")

# Environment-specific cache directory
def get_cache_dir() -> str:
    """Get cache directory based on deployment mode"""
    if is_modal_mode():
        return "/root/cache"
    else:
        # Local mode - use the user's home directory
        home_dir = os.path.expanduser("~")
        cache_dir = os.path.join(home_dir, ".gradio_mcp_cache")
        os.makedirs(cache_dir, exist_ok=True)
        return cache_dir

# Auto-load endpoint configuration in local mode
if is_local_mode():
    import json
    config_file = "endpoint_config.json"
    if os.path.exists(config_file):
        try:
            with open(config_file, 'r') as f:
                config = json.load(f)
            for endpoint_name, url in config.items():
                set_endpoint_url(endpoint_name, url)
            print(f"✅ Loaded endpoint configuration from {config_file}")
        except Exception as e:
            print(f"⚠️ Failed to load endpoint configuration: {e}")
    else:
        print("⚠️ No endpoint configuration found. Run 'python deploy_endpoints.py deploy' first.")

print(f"🚀 Deployment mode: {DEPLOYMENT_MODE.value}")
print(f"📁 Cache directory: {get_cache_dir()}")
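A minimal sketch of wiring an endpoint URL in local mode via the setters above (not part of the diff; the URL is a hypothetical placeholder):

# Hypothetical endpoint registration for local mode; the URL is a placeholder.
from src.config.config import set_endpoint_url, get_transcribe_endpoint_url

set_endpoint_url("transcribe_audio", "https://example--transcribe-audio.modal.run")
assert get_transcribe_endpoint_url() == "https://example--transcribe-audio.modal.run"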
src/config/modal_config.py
ADDED
@@ -0,0 +1,210 @@
import modal
import os

# Create Modal application
app = modal.App(name="gradio-mcp-server")

# Try to get Hugging Face token from Modal secrets (required for speaker diarization)
try:
    hf_secret = modal.Secret.from_name("huggingface-secret")
    print("✅ Found Hugging Face secret configuration")
except Exception:
    hf_secret = None
    print("⚠️ Hugging Face secret not found, speaker diarization will be disabled")

# Create mounted volume
volume = modal.Volume.from_name("cache-volume", create_if_missing=True)
cache_dir = "/root/cache"

# Model preloading function
def download_models() -> None:
    """Download and cache Whisper and speaker diarization models"""
    import whisper
    import os
    from pathlib import Path

    # Create model cache directory
    model_cache_dir = Path("/model")
    model_cache_dir.mkdir(exist_ok=True)

    print("📥 Downloading Whisper turbo model...")
    # Download and cache Whisper turbo model
    whisper_model = whisper.load_model("turbo", download_root="/model")
    print("✅ Whisper turbo model downloaded and cached")

    # Download speaker diarization models if HF token is available
    if os.environ.get("HF_TOKEN"):
        try:
            print("📥 Downloading speaker diarization models...")
            from pyannote.audio import Pipeline, Model
            from pyannote.audio.core.inference import Inference
            import torch

            # Set proper cache directory for pyannote
            os.environ["PYANNOTE_CACHE"] = "/model/speaker-diarization"

            # Download and cache speaker diarization pipeline
            # This will automatically cache to the PYANNOTE_CACHE directory
            pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=os.environ["HF_TOKEN"],
                cache_dir="/model/speaker-diarization"
            )

            # Preload speaker embedding model for speaker identification
            print("📥 Downloading speaker embedding model...")
            embedding_model = Model.from_pretrained(
                "pyannote/embedding",
                use_auth_token=os.environ["HF_TOKEN"],
                cache_dir="/model/speaker-embedding"
            )

            # Set device for models
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            embedding_model.to(device)
            embedding_model.eval()

            # Create inference object for embedding extraction
            inference = Inference(embedding_model, window="whole")

            # Verify the pipeline works
            print("🧪 Testing speaker diarization pipeline...")

            # Create a simple marker file to indicate successful download
            import json
            speaker_dir = Path("/model/speaker-diarization")
            speaker_dir.mkdir(exist_ok=True, parents=True)

            embedding_dir = Path("/model/speaker-embedding")
            embedding_dir.mkdir(exist_ok=True, parents=True)

            config = {
                "model_name": "pyannote/speaker-diarization-3.1",
                "embedding_model_name": "pyannote/embedding",
                "cached_at": str(speaker_dir),
                "embedding_cached_at": str(embedding_dir),
                "cache_complete": True,
                "embedding_cache_complete": True,
                "pyannote_cache_env": "/model/speaker-diarization",
                "device": str(device)
            }
            with open(speaker_dir / "download_complete.json", "w") as f:
                json.dump(config, f)

            print("✅ Speaker diarization and embedding models downloaded and cached")
        except Exception as e:
            print(f"⚠️ Failed to download speaker diarization models: {e}")
    else:
        print("⚠️ No HF_TOKEN found, skipping speaker diarization model download")

# Create image environment with model preloading
image = modal.Image.debian_slim(python_version="3.11").apt_install(
    # Basic tools
    "ffmpeg",
    "wget",
    "curl",
    "unzip",
    "gnupg2",
    "git",  # Required by Whisper
    # Chrome dependencies
    "libglib2.0-0",
    "libnss3",
    "libatk-bridge2.0-0",
    "libdrm2",
    "libxkbcommon0",
    "libxcomposite1",
    "libxdamage1",
    "libxrandr2",
    "libgbm1",
    "libxss1",
    "libasound2"
).run_commands(
    # Download and install Chrome directly (faster method)
    "wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
    "apt-get install -y ./google-chrome-stable_current_amd64.deb || apt-get install -y -f",
    "rm google-chrome-stable_current_amd64.deb"
).pip_install(
    # Web frameworks and basic libraries
    "gradio>=5.31.0",
    "fastapi",
    "pydantic",
    "python-dotenv",
    # MCP related
    "mcp[cli]",
    "fastmcp>=2.7.0",
    "starlette",
    # Network and parsing
    "beautifulsoup4",
    "selenium",
    "requests",
    # Whisper and audio processing related
    "git+https://github.com/openai/whisper.git",
    "ffmpeg-python",
    "torchaudio==2.1.0",
    "numpy<2",
    # Audio processing dependencies
    "librosa",
    "soundfile",
    # Other Whisper ecosystem dependencies
    "dacite",
    "jiwer",
    "pandas",
    "loguru==0.6.0",
    # GraphQL client (if needed)
    "gql[all]~=3.0.0a5",
    # Speaker diarization related dependencies
    "pyannote.audio==3.1.0",
    # System monitoring
    "psutil",
).run_function(
    download_models,
    secrets=[hf_secret] if hf_secret else []
)

# Update file paths to reflect the new structure
image = image.add_local_dir("../src", remote_path="/root/src")
secrets = [hf_secret] if hf_secret else []

# ==================== Modal Endpoints Configuration ====================

@app.function(
    image=image,
    volumes={cache_dir: volume},
    cpu=4,  # Increased CPU for better performance
    memory=8192,  # 8GB memory for stable transcription
    gpu="A10G",
    timeout=1800,  # 30 minutes timeout for speaker diarization support
    scaledown_window=40,  # 40 seconds before scaling down
    secrets=secrets,
)
@modal.fastapi_endpoint(method="POST", label="transcribe-audio-chunk-endpoint")
def transcribe_audio_chunk_endpoint(request_data: dict):
    """FastAPI endpoint for transcribing a single audio chunk (for distributed processing)"""
    import sys
    sys.path.append('/root')

    from src.services.modal_transcription_service import ModalTranscriptionService

    modal_service = ModalTranscriptionService(cache_dir="/root/cache", use_direct_modal_calls=True)
    return modal_service.process_chunk_request(request_data)

@app.function(
    image=image,
    cpu=2,  # Increased CPU for better health check performance
    memory=2048,  # 2GB memory for stability
    timeout=300,  # 5 minutes timeout for health checks
    scaledown_window=600,  # 10 minutes before scaling down
    secrets=secrets,
)
@modal.fastapi_endpoint(method="GET", label="health-check-endpoint")
def health_check_endpoint():
    """Health check endpoint to verify service status"""
    import sys
    sys.path.append('/root')

    from src.services.health_service import HealthService

    health_service = HealthService()
    return health_service.get_health_status()
src/core/__init__.py
ADDED
@@ -0,0 +1,29 @@
"""
Core components for application and audio processing
"""

# Original core components
from .config import AppConfig, app_config, get_deployment_mode, is_local_mode, is_modal_mode
from .exceptions import AppError, ConfigError, ValidationError

# Audio processing core components
from .audio_splitter import FFmpegAudioSplitter
from .whisper_transcriber import WhisperTranscriber
from .speaker_diarization import PyannoteSpeikerDetector

__all__ = [
    # Original core
    "AppConfig",
    "app_config",
    "get_deployment_mode",
    "is_local_mode",
    "is_modal_mode",
    "AppError",
    "ConfigError",
    "ValidationError",

    # Audio processing core
    "FFmpegAudioSplitter",
    "WhisperTranscriber",
    "PyannoteSpeikerDetector"
]
src/core/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (713 Bytes).
src/core/__pycache__/audio_splitter.cpython-310.pyc
ADDED
Binary file (2.32 kB).
src/core/__pycache__/config.cpython-310.pyc
ADDED
Binary file (5.14 kB).
src/core/__pycache__/exceptions.cpython-310.pyc
ADDED
Binary file (1.39 kB).
src/core/__pycache__/speaker_diarization.cpython-310.pyc
ADDED
Binary file (3.78 kB).
src/core/__pycache__/whisper_transcriber.cpython-310.pyc
ADDED
Binary file (3.41 kB).
src/core/audio_splitter.py
ADDED
@@ -0,0 +1,90 @@
"""
Audio splitter implementation using FFmpeg
"""

import re
from typing import Iterator
import ffmpeg

from ..interfaces.audio_splitter import IAudioSplitter, AudioSegment
from ..utils.errors import AudioSplittingError


class FFmpegAudioSplitter(IAudioSplitter):
    """Audio splitter using FFmpeg's silence detection"""

    def split_audio(
        self,
        audio_path: str,
        min_segment_length: float = 30.0,
        min_silence_length: float = 1.0
    ) -> Iterator[AudioSegment]:
        """Split audio by silence detection"""

        try:
            silence_end_re = re.compile(
                r" silence_end: (?P<end>[0-9]+(\.?[0-9]*)) \| silence_duration: (?P<dur>[0-9]+(\.?[0-9]*))"
            )

            # Get audio duration
            duration = self.get_audio_duration(audio_path)

            # Use silence detection filter
            reader = (
                ffmpeg.input(str(audio_path))
                .filter("silencedetect", n="-10dB", d=min_silence_length)
                .output("pipe:", format="null")
                .run_async(pipe_stderr=True)
            )

            cur_start = 0.0
            segment_count = 0

            while True:
                line = reader.stderr.readline().decode("utf-8")
                if not line:
                    break

                match = silence_end_re.search(line)
                if match:
                    silence_end, silence_dur = match.group("end"), match.group("dur")
                    split_at = float(silence_end) - (float(silence_dur) / 2)

                    if (split_at - cur_start) < min_segment_length:
                        continue

                    yield AudioSegment(
                        start=cur_start,
                        end=split_at,
                        duration=split_at - cur_start
                    )
                    cur_start = split_at
                    segment_count += 1

            # Handle the last segment
            if duration > cur_start:
                yield AudioSegment(
                    start=cur_start,
                    end=duration,
                    duration=duration - cur_start
                )
                segment_count += 1

            print(f"Audio split into {segment_count} segments")

        except Exception as e:
            raise AudioSplittingError(
                f"Failed to split audio: {str(e)}",
                audio_file=audio_path
            )

    def get_audio_duration(self, audio_path: str) -> float:
        """Get total duration of audio file"""
        try:
            metadata = ffmpeg.probe(audio_path)
            return float(metadata["format"]["duration"])
        except Exception as e:
            raise AudioSplittingError(
                f"Failed to get audio duration: {str(e)}",
                audio_file=audio_path
            )
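A minimal usage sketch of the silence-based splitter above (not part of the diff; "episode.mp3" is a placeholder and ffmpeg must be on PATH):

# Hypothetical usage of FFmpegAudioSplitter; the input file is a placeholder.
from src.core.audio_splitter import FFmpegAudioSplitter

splitter = FFmpegAudioSplitter()
for segment in splitter.split_audio("episode.mp3", min_segment_length=30.0):
    print(f"{segment.start:.1f}s -> {segment.end:.1f}s ({segment.duration:.1f}s)")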
src/core/config.py
ADDED
@@ -0,0 +1,150 @@
"""
Configuration management for PodcastMCP
"""

import os
import json
from enum import Enum
from typing import Optional, Dict, Any
from pathlib import Path


class DeploymentMode(Enum):
    """Deployment mode enumeration"""
    LOCAL = "local"      # Local Gradio + Modal GPU endpoints
    MODAL = "modal"      # Runs entirely on the Modal platform
    HF_SPACES = "hf"     # Hugging Face Spaces deployment


class AppConfig:
    """Application configuration manager"""

    def __init__(self):
        self._deployment_mode = self._detect_deployment_mode()
        self._cache_dir = self._get_cache_directory()
        self._endpoints = self._load_endpoints()

    @property
    def deployment_mode(self) -> DeploymentMode:
        """Get the current deployment mode"""
        return self._deployment_mode

    @property
    def cache_dir(self) -> str:
        """Get the cache directory"""
        return self._cache_dir

    @property
    def is_local_mode(self) -> bool:
        """Whether running in local mode"""
        return self._deployment_mode == DeploymentMode.LOCAL

    @property
    def is_modal_mode(self) -> bool:
        """Whether running in Modal mode"""
        return self._deployment_mode == DeploymentMode.MODAL

    @property
    def is_hf_spaces_mode(self) -> bool:
        """Whether running in HF Spaces mode"""
        return self._deployment_mode == DeploymentMode.HF_SPACES

    def get_transcribe_endpoint_url(self) -> Optional[str]:
        """Get the transcription endpoint URL"""
        return self._endpoints.get("transcribe_audio")

    def set_endpoint_url(self, service: str, url: str):
        """Set an endpoint URL"""
        self._endpoints[service] = url
        self._save_endpoints()

    def _detect_deployment_mode(self) -> DeploymentMode:
        """Auto-detect the deployment mode"""
        # Check the environment variable
        mode = os.environ.get("DEPLOYMENT_MODE", "").lower()
        if mode == "modal":
            return DeploymentMode.MODAL
        elif mode == "hf":
            return DeploymentMode.HF_SPACES

        # Check whether running in the HF Spaces environment
        if os.environ.get("SPACE_ID") or os.environ.get("SPACES_ZERO_GPU"):
            return DeploymentMode.HF_SPACES

        # Check whether running in the Modal environment
        if os.environ.get("MODAL_TASK_ID") or os.environ.get("MODAL_IS_INSIDE_CONTAINER"):
            return DeploymentMode.MODAL

        # Default to local mode
        return DeploymentMode.LOCAL

    def _get_cache_directory(self) -> str:
        """Get the cache directory path"""
        if self.is_modal_mode:
            return "/root/cache"
        else:
            # Local mode and HF Spaces use the user's cache directory
            home_dir = Path.home()
            cache_dir = home_dir / ".gradio_mcp_cache"
            cache_dir.mkdir(exist_ok=True)
            return str(cache_dir)

    def _load_endpoints(self) -> Dict[str, str]:
        """Load the endpoint configuration"""
        config_file = Path("endpoint_config.json")
        if config_file.exists():
            try:
                with open(config_file, 'r') as f:
                    endpoints = json.load(f)
                print(f"✅ Loaded endpoint configuration from {config_file}")
                return endpoints
            except Exception as e:
                print(f"⚠️ Failed to load endpoint config: {e}")
        else:
            print("⚠️ No endpoint configuration found. Run deployment first.")

        return {}

    def _save_endpoints(self):
        """Save the endpoint configuration"""
        config_file = Path("endpoint_config.json")
        try:
            with open(config_file, 'w') as f:
                json.dump(self._endpoints, f, indent=2)
            print(f"💾 Endpoint configuration saved to {config_file}")
        except Exception as e:
            print(f"⚠️ Failed to save endpoint config: {e}")


# Global configuration instance
app_config = AppConfig()

# Backward-compatible function interfaces
def get_deployment_mode() -> str:
    """Get the deployment mode as a string"""
    return app_config.deployment_mode.value

def is_local_mode() -> bool:
    """Whether running in local mode"""
    return app_config.is_local_mode

def is_modal_mode() -> bool:
    """Whether running in Modal mode"""
    return app_config.is_modal_mode

def get_cache_dir() -> str:
    """Get the cache directory"""
    return app_config.cache_dir

def get_transcribe_endpoint_url() -> Optional[str]:
    """Get the transcription endpoint URL"""
    return app_config.get_transcribe_endpoint_url()

def set_endpoint_url(service: str, url: str):
    """Set an endpoint URL"""
    app_config.set_endpoint_url(service, url)


# Print configuration info
print(f"🚀 Deployment mode: {app_config.deployment_mode.value}")
print(f"📁 Cache directory: {app_config.cache_dir}")
src/core/exceptions.py
ADDED
@@ -0,0 +1,43 @@
"""
Custom exceptions for PodcastMCP
"""


class PodcastMCPError(Exception):
    """Base exception class for PodcastMCP"""
    pass


class AppError(PodcastMCPError):
    """Application error"""
    pass


class ConfigError(PodcastMCPError):
    """Configuration-related error"""
    pass


class ValidationError(PodcastMCPError):
    """Validation-related error"""
    pass


class TranscriptionError(PodcastMCPError):
    """Transcription-related error"""
    pass


class DeploymentError(PodcastMCPError):
    """Deployment-related error"""
    pass


class FileNotFoundError(PodcastMCPError):
    """File-not-found error (note: shadows the built-in FileNotFoundError)"""
    pass


class EndpointError(PodcastMCPError):
    """Endpoint-related error"""
    pass
src/core/speaker_diarization.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+"""
+Speaker diarization implementation using pyannote.audio
+"""
+
+import os
+import torch
+from typing import Optional, List, Dict, Any
+
+from ..interfaces.speaker_detector import ISpeakerDetector
+from ..utils.config import AudioProcessingConfig
+from ..utils.errors import SpeakerDiarizationError, ModelLoadError
+
+
+class PyannoteSpeakerDetector(ISpeakerDetector):
+    """Speaker diarization using pyannote.audio"""
+
+    def __init__(self, config: Optional[AudioProcessingConfig] = None):
+        self.config = config or AudioProcessingConfig()
+        self.device = self._setup_device()
+        self.pipeline = None
+        self.auth_token = os.environ.get(self.config.hf_token_env_var)
+
+        if not self.auth_token:
+            print("⚠️ No Hugging Face token found. Speaker diarization will be disabled.")
+
+    def _setup_device(self) -> torch.device:
+        """Setup and return the best available device"""
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        else:
+            return torch.device("cpu")
+
+    async def detect_speakers(
+        self,
+        audio_file_path: str,
+        num_speakers: Optional[int] = None,
+        min_speakers: int = 1,
+        max_speakers: int = 10
+    ) -> Dict[str, Any]:
+        """Detect speakers in audio file"""
+
+        if not self.auth_token:
+            raise SpeakerDiarizationError(
+                "Speaker diarization requires Hugging Face token",
+                audio_file=audio_file_path
+            )
+
+        try:
+            # Load pipeline if not already loaded
+            if self.pipeline is None:
+                self.pipeline = self._load_pipeline()
+
+            # Perform diarization
+            # (num_speakers / min_speakers / max_speakers are accepted by this
+            # method but not forwarded to the pipeline call below)
+            diarization = self.pipeline(audio_file_path)
+
+            # Convert to our format
+            speakers = {}
+            segments = []
+
+            for turn, _, speaker in diarization.itertracks(yield_label=True):
+                speaker_id = f"SPEAKER_{speaker.split('_')[-1].zfill(2)}"
+                segments.append({
+                    "start": turn.start,
+                    "end": turn.end,
+                    "speaker": speaker_id
+                })
+
+                if speaker_id not in speakers:
+                    speakers[speaker_id] = {
+                        "id": speaker_id,
+                        "total_time": 0.0,
+                        "segments": []
+                    }
+
+                speakers[speaker_id]["total_time"] += turn.end - turn.start
+                speakers[speaker_id]["segments"].append({
+                    "start": turn.start,
+                    "end": turn.end
+                })
+
+            return {
+                "speaker_count": len(speakers),
+                "speakers": speakers,
+                "segments": segments,
+                "audio_file": audio_file_path
+            }
+
+        except Exception as e:
+            raise SpeakerDiarizationError(
+                f"Speaker detection failed: {str(e)}",
+                audio_file=audio_file_path
+            )
+
+    def _load_pipeline(self):
+        """Load pyannote speaker diarization pipeline"""
+        try:
+            # Suppress warnings
+            import warnings
+            warnings.filterwarnings("ignore", category=UserWarning, module="pyannote")
+            warnings.filterwarnings("ignore", category=UserWarning, module="pytorch_lightning")
+            warnings.filterwarnings("ignore", category=FutureWarning, module="pytorch_lightning")
+
+            from pyannote.audio import Pipeline
+
+            print("📥 Loading speaker diarization pipeline...")
+            pipeline = Pipeline.from_pretrained(
+                self.config.speaker_diarization_model,
+                use_auth_token=self.auth_token
+            )
+            pipeline.to(self.device)
+
+            return pipeline
+
+        except Exception as e:
+            raise ModelLoadError(
+                f"Failed to load speaker diarization pipeline: {str(e)}",
+                model_name=self.config.speaker_diarization_model
+            )
+
+    def get_supported_models(self) -> List[str]:
+        """Get list of supported speaker diarization models"""
+        return [self.config.speaker_diarization_model]
+
+    def is_available(self) -> bool:
+        """Check if speaker diarization is available"""
+        return self.auth_token is not None
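
A usage sketch for the detector above, assuming a local `episode.wav` and that the environment variable named by `AudioProcessingConfig.hf_token_env_var` holds a valid Hugging Face token:

import asyncio

from src.core.speaker_diarization import PyannoteSpeakerDetector

async def main():
    detector = PyannoteSpeakerDetector()
    if not detector.is_available():
        print("Set the Hugging Face token env var to enable diarization")
        return
    result = await detector.detect_speakers("episode.wav")
    print(f"Found {result['speaker_count']} speakers")
    for seg in result["segments"][:5]:
        print(f"{seg['speaker']}: {seg['start']:.1f}s - {seg['end']:.1f}s")

asyncio.run(main())
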
src/core/whisper_transcriber.py
ADDED
@@ -0,0 +1,113 @@
+"""
+Local Whisper transcriber implementation
+"""
+
+import whisper
+import torch
+import pathlib
+import time
+from typing import Optional, List
+
+from ..interfaces.transcriber import ITranscriber, TranscriptionResult, TranscriptionSegment
+from ..utils.config import AudioProcessingConfig
+from ..utils.errors import TranscriptionError, ModelLoadError
+
+
+class WhisperTranscriber(ITranscriber):
+    """Local Whisper transcriber implementation"""
+
+    def __init__(self, config: Optional[AudioProcessingConfig] = None):
+        self.config = config or AudioProcessingConfig()
+        self.model_cache = {}
+        self.device = self._setup_device()
+
+    def _setup_device(self) -> str:
+        """Setup and return the best available device"""
+        if torch.cuda.is_available():
+            return "cuda"
+        else:
+            return "cpu"
+
+    async def transcribe(
+        self,
+        audio_file_path: str,
+        model_size: str = "turbo",
+        language: Optional[str] = None,
+        enable_speaker_diarization: bool = False
+    ) -> TranscriptionResult:
+        """Transcribe audio using local Whisper model"""
+
+        try:
+            # Validate audio file
+            audio_path = pathlib.Path(audio_file_path)
+            if not audio_path.exists():
+                raise TranscriptionError(
+                    f"Audio file not found: {audio_file_path}",
+                    audio_file=audio_file_path
+                )
+
+            # Load model
+            model = self._load_model(model_size)
+
+            # Transcribe
+            start_time = time.time()
+            result = model.transcribe(
+                str(audio_path),
+                language=language,
+                verbose=False
+            )
+            processing_time = time.time() - start_time
+
+            # Convert to our format
+            segments = []
+            for seg in result.get("segments", []):
+                segments.append(TranscriptionSegment(
+                    start=seg["start"],
+                    end=seg["end"],
+                    text=seg["text"].strip(),
+                    confidence=seg.get("avg_logprob")
+                ))
+
+            return TranscriptionResult(
+                text=result.get("text", "").strip(),
+                segments=segments,
+                language=result.get("language", "unknown"),
+                model_used=model_size,
+                # Whisper's result dict has no "duration" key, so fall back to
+                # the last segment's end time instead of always reporting 0
+                audio_duration=result.get("duration", segments[-1].end if segments else 0),
+                processing_time=processing_time,
+                speaker_diarization_enabled=enable_speaker_diarization,
+                global_speaker_count=0,
+                error_message=None
+            )
+
+        except Exception as e:
+            raise TranscriptionError(
+                f"Whisper transcription failed: {str(e)}",
+                model=model_size,
+                audio_file=audio_file_path
+            )
+
+    def _load_model(self, model_size: str):
+        """Load Whisper model with caching"""
+        if model_size not in self.model_cache:
+            try:
+                print(f"📥 Loading Whisper model: {model_size}")
+                self.model_cache[model_size] = whisper.load_model(
+                    model_size,
+                    device=self.device
+                )
+            except Exception as e:
+                raise ModelLoadError(
+                    f"Failed to load model {model_size}: {str(e)}",
+                    model_name=model_size
+                )
+
+        return self.model_cache[model_size]
+
+    def get_supported_models(self) -> List[str]:
+        """Get list of supported model sizes"""
+        return list(self.config.whisper_models.keys())
+
+    def get_supported_languages(self) -> List[str]:
+        """Get list of supported language codes"""
+        return ["en", "zh", "ja", "ko", "es", "fr", "de", "ru", "auto"]
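
A matching usage sketch; the file name is a placeholder, and the `TranscriptionResult` fields follow the constructor call above:

import asyncio

from src.core.whisper_transcriber import WhisperTranscriber

async def main():
    transcriber = WhisperTranscriber()
    result = await transcriber.transcribe("episode.mp3", model_size="turbo")
    print(f"[{result.language}] {len(result.segments)} segments, "
          f"processed in {result.processing_time:.1f}s")
    print(result.text[:200])

asyncio.run(main())
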
src/deployment/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+Deployment management for audio processing services
+"""
+
+from .modal_deployer import ModalDeployer
+from .endpoint_manager import EndpointManager
+
+__all__ = ["ModalDeployer", "EndpointManager"]
src/deployment/deployment_manager.py
ADDED
@@ -0,0 +1,153 @@
+"""
+Simplified deployment manager.
+This replaces the complex deploy_endpoints.py with a cleaner interface.
+"""
+
+import argparse
+import sys
+from typing import Optional
+
+from .modal_deployer import ModalDeployer
+from .endpoint_manager import EndpointManager
+from ..utils.config import AudioProcessingConfig
+from ..utils.errors import DeploymentError
+
+
+class DeploymentManager:
+    """Simplified deployment manager for audio processing services"""
+
+    def __init__(self):
+        self.config = AudioProcessingConfig()
+        self.modal_deployer = ModalDeployer(self.config)
+        self.endpoint_manager = EndpointManager()
+
+    def deploy(self) -> bool:
+        """Deploy transcription service"""
+        try:
+            print("🚀 Starting deployment process...")
+            endpoint_url = self.modal_deployer.deploy_transcription_service()
+
+            if endpoint_url:
+                print("✅ Deployment successful!")
+                print(f"🌐 Endpoint URL: {endpoint_url}")
+                return True
+            else:
+                print("❌ Deployment failed: Could not get endpoint URL")
+                return False
+
+        except DeploymentError as e:
+            print(f"❌ Deployment failed: {e.message}")
+            if e.details:
+                print(f"📋 Details: {e.details}")
+            return False
+        except Exception as e:
+            print(f"❌ Unexpected deployment error: {str(e)}")
+            return False
+
+    def status(self) -> bool:
+        """Check deployment status"""
+        print("🔍 Checking deployment status...")
+
+        endpoints = self.endpoint_manager.list_endpoints()
+        if not endpoints:
+            print("❌ No endpoints configured")
+            return False
+
+        print("📋 Configured endpoints:")
+        for name, url in endpoints.items():
+            print(f"  • {name}: {url}")
+
+        # Check health
+        return self.modal_deployer.check_deployment_status()
+
+    def undeploy(self):
+        """Remove deployment configuration"""
+        print("🗑️ Removing deployment configuration...")
+        self.modal_deployer.undeploy_transcription_service()
+
+    def list_endpoints(self):
+        """List all configured endpoints"""
+        endpoints = self.endpoint_manager.list_endpoints()
+
+        if not endpoints:
+            print("📋 No endpoints configured")
+            return
+
+        print("📋 Configured endpoints:")
+        for name, url in endpoints.items():
+            health_status = "✅ Healthy" if self.endpoint_manager.check_endpoint_health(name) else "❌ Unhealthy"
+            print(f"  • {name}: {url} ({health_status})")
+
+    def set_endpoint(self, name: str, url: str):
+        """Manually set an endpoint"""
+        self.endpoint_manager.set_endpoint(name, url)
+
+    def remove_endpoint(self, name: str):
+        """Remove an endpoint"""
+        self.endpoint_manager.remove_endpoint(name)
+
+
+def main():
+    """Command line interface for deployment manager"""
+    parser = argparse.ArgumentParser(description="Audio Processing Deployment Manager")
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+    # Deploy command
+    subparsers.add_parser("deploy", help="Deploy transcription service to Modal")
+
+    # Status command
+    subparsers.add_parser("status", help="Check deployment status")
+
+    # Undeploy command
+    subparsers.add_parser("undeploy", help="Remove deployment configuration")
+
+    # List endpoints command
+    subparsers.add_parser("list", help="List all configured endpoints")
+
+    # Set endpoint command
+    set_parser = subparsers.add_parser("set", help="Set endpoint URL manually")
+    set_parser.add_argument("name", help="Endpoint name")
+    set_parser.add_argument("url", help="Endpoint URL")
+
+    # Remove endpoint command
+    remove_parser = subparsers.add_parser("remove", help="Remove endpoint")
+    remove_parser.add_argument("name", help="Endpoint name")
+
+    args = parser.parse_args()
+
+    if not args.command:
+        parser.print_help()
+        return
+
+    manager = DeploymentManager()
+
+    try:
+        if args.command == "deploy":
+            success = manager.deploy()
+            sys.exit(0 if success else 1)
+
+        elif args.command == "status":
+            success = manager.status()
+            sys.exit(0 if success else 1)
+
+        elif args.command == "undeploy":
+            manager.undeploy()
+
+        elif args.command == "list":
+            manager.list_endpoints()
+
+        elif args.command == "set":
+            manager.set_endpoint(args.name, args.url)
+
+        elif args.command == "remove":
+            manager.remove_endpoint(args.name)
+
+    except KeyboardInterrupt:
+        print("\n⚠️ Operation cancelled by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
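
Assuming the package is importable as `src.deployment` (an inference from the file layout in this upload), the argparse CLI wired up in `main()` can be driven like this:

python -m src.deployment.deployment_manager deploy
python -m src.deployment.deployment_manager status
python -m src.deployment.deployment_manager set transcribe_audio https://my-app--transcribe-audio-endpoint.modal.run
python -m src.deployment.deployment_manager remove transcribe_audio
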
src/deployment/endpoint_manager.py
ADDED
@@ -0,0 +1,76 @@
+"""
+Endpoint manager for handling Modal endpoints
+"""
+
+import json
+import os
+from typing import Dict, Optional
+
+from ..utils.errors import ConfigurationError
+
+
+class EndpointManager:
+    """Manager for Modal endpoint configuration"""
+
+    def __init__(self, config_file: str = "endpoint_config.json"):
+        self.config_file = config_file
+        self._endpoints = self._load_endpoints()
+
+    def _load_endpoints(self) -> Dict[str, str]:
+        """Load endpoints from configuration file"""
+        if not os.path.exists(self.config_file):
+            return {}
+
+        try:
+            with open(self.config_file, 'r') as f:
+                return json.load(f)
+        except Exception as e:
+            print(f"⚠️ Failed to load endpoint configuration: {e}")
+            return {}
+
+    def save_endpoints(self):
+        """Save endpoints to configuration file"""
+        try:
+            with open(self.config_file, 'w') as f:
+                json.dump(self._endpoints, f, indent=2)
+            print(f"💾 Endpoint configuration saved to {self.config_file}")
+        except Exception as e:
+            raise ConfigurationError(f"Failed to save endpoint configuration: {e}")
+
+    def set_endpoint(self, name: str, url: str):
+        """Set endpoint URL"""
+        self._endpoints[name] = url
+        self.save_endpoints()
+        print(f"✅ Endpoint '{name}' set to: {url}")
+
+    def get_endpoint(self, name: str) -> Optional[str]:
+        """Get endpoint URL"""
+        return self._endpoints.get(name)
+
+    def remove_endpoint(self, name: str):
+        """Remove endpoint"""
+        if name in self._endpoints:
+            del self._endpoints[name]
+            self.save_endpoints()
+            print(f"🗑️ Endpoint '{name}' removed")
+        else:
+            print(f"⚠️ Endpoint '{name}' not found")
+
+    def list_endpoints(self) -> Dict[str, str]:
+        """List all endpoints"""
+        return self._endpoints.copy()
+
+    def check_endpoint_health(self, name: str) -> bool:
+        """Check if endpoint is healthy"""
+        url = self.get_endpoint(name)
+        if not url:
+            return False
+
+        try:
+            import requests
+            # Try a simple health check (adjust based on your endpoint)
+            health_url = url.replace("/transcribe", "/health")
+            response = requests.get(health_url, timeout=10)
+            return response.status_code == 200
+        except Exception:
+            return False
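
A usage sketch (the URL is a made-up example; note that `check_endpoint_health` derives the probe URL by rewriting `/transcribe` to `/health`):

from src.deployment.endpoint_manager import EndpointManager

manager = EndpointManager("endpoint_config.json")
manager.set_endpoint(
    "transcribe_audio",
    "https://my-app--transcribe-audio-endpoint.modal.run/transcribe"
)
print(manager.list_endpoints())
print("healthy:", manager.check_endpoint_health("transcribe_audio"))
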
src/deployment/modal_deployer.py
ADDED
@@ -0,0 +1,97 @@
+"""
+Modal deployer for deploying transcription services
+"""
+
+import subprocess
+from typing import Optional
+
+from ..utils.config import AudioProcessingConfig
+from ..utils.errors import DeploymentError
+from .endpoint_manager import EndpointManager
+
+
+class ModalDeployer:
+    """Deployer for Modal transcription services"""
+
+    def __init__(self, config: Optional[AudioProcessingConfig] = None):
+        self.config = config or AudioProcessingConfig()
+        self.endpoint_manager = EndpointManager()
+
+    def deploy_transcription_service(self) -> Optional[str]:
+        """Deploy transcription service to Modal"""
+        print("🚀 Deploying transcription service to Modal...")
+
+        try:
+            # Deploy the Modal app
+            print("🚀 Running modal deploy command...")
+            result = subprocess.run(
+                ["modal", "deploy", "modal_config.py"],
+                capture_output=True,
+                text=True
+            )
+
+            if result.returncode == 0:
+                # Extract or construct endpoint URL
+                endpoint_url = self._extract_endpoint_url(result.stdout)
+
+                if endpoint_url:
+                    # Save endpoint configuration
+                    self.endpoint_manager.set_endpoint("transcribe_audio", endpoint_url)
+                    print(f"✅ Transcription service deployed: {endpoint_url}")
+                    return endpoint_url
+                else:
+                    print("⚠️ Could not extract endpoint URL from deployment output")
+                    return None
+            else:
+                raise DeploymentError(
+                    f"Modal deployment failed: {result.stderr}",
+                    service="transcription"
+                )
+
+        except FileNotFoundError:
+            # Built-in FileNotFoundError: the "modal" executable is missing
+            raise DeploymentError(
+                "Modal CLI not found. Please install Modal: pip install modal",
+                service="transcription"
+            )
+        except Exception as e:
+            raise DeploymentError(
+                f"Failed to deploy transcription service: {str(e)}",
+                service="transcription"
+            )
+
+    def _extract_endpoint_url(self, output: str) -> Optional[str]:
+        """Extract endpoint URL from deployment output"""
+        # Look for URL in output
+        for line in output.split('\n'):
+            if 'https://' in line and 'modal.run' in line:
+                # Extract URL from line
+                parts = line.split()
+                for part in parts:
+                    if part.startswith('https://') and 'modal.run' in part:
+                        return part
+
+        # Fallback to constructed URL
+        return f"https://{self.config.modal_app_name}--transcribe-audio-endpoint.modal.run"
+
+    def check_deployment_status(self) -> bool:
+        """Check if transcription service is deployed and healthy"""
+        endpoint_url = self.endpoint_manager.get_endpoint("transcribe_audio")
+        if not endpoint_url:
+            print("❌ No transcription endpoint configured")
+            return False
+
+        if self.endpoint_manager.check_endpoint_health("transcribe_audio"):
+            print(f"✅ Transcription service is healthy: {endpoint_url}")
+            return True
+        else:
+            print(f"❌ Transcription service is not responding: {endpoint_url}")
+            return False
+
+    def undeploy_transcription_service(self):
+        """Remove transcription service endpoint"""
+        self.endpoint_manager.remove_endpoint("transcribe_audio")
+        print("🗑️ Transcription service endpoint removed from configuration")
+        print("💡 Note: The actual Modal deployment may still be active. Use 'modal app stop' to stop it.")
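
The health check above only succeeds if the deployed app serves a `/health` route alongside `/transcribe`. A minimal FastAPI handler of that shape might look like the sketch below; this is written under that assumption and is not code from `modal_config.py`:

from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
def health() -> dict:
    # Liveness probe matching the "/transcribe" -> "/health" rewrite
    # in EndpointManager.check_endpoint_health
    return {"status": "ok"}
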
src/interfaces/__init__.py
ADDED
@@ -0,0 +1,38 @@
+"""
+Interfaces for audio processing components
+"""
+
+from .transcriber import ITranscriber
+from .speaker_detector import ISpeakerDetector
+from .audio_splitter import IAudioSplitter
+from .audio_processor import IAudioProcessor, AudioSegment
+from .podcast_downloader import IPodcastDownloader, PodcastInfo, DownloadResult, PodcastPlatform
+from .speaker_manager import (
+    ISpeakerEmbeddingManager,
+    ISpeakerIdentificationService,
+    SpeakerEmbedding,
+    SpeakerSegment
+)
+
+__all__ = [
+    # Core interfaces
+    "ITranscriber",
+    "ISpeakerDetector",
+    "IAudioSplitter",
+
+    # New service interfaces
+    "IAudioProcessor",
+    "IPodcastDownloader",
+    "ISpeakerEmbeddingManager",
+    "ISpeakerIdentificationService",
+
+    # Data classes
+    "AudioSegment",
+    "PodcastInfo",
+    "DownloadResult",
+    "SpeakerEmbedding",
+    "SpeakerSegment",
+
+    # Enums
+    "PodcastPlatform"
+]
src/interfaces/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (828 Bytes)

src/interfaces/__pycache__/audio_processor.cpython-310.pyc
ADDED
Binary file (2.01 kB)

src/interfaces/__pycache__/audio_splitter.cpython-310.pyc
ADDED
Binary file (1.84 kB)

src/interfaces/__pycache__/podcast_downloader.cpython-310.pyc
ADDED
Binary file (2.63 kB)

src/interfaces/__pycache__/speaker_detector.cpython-310.pyc
ADDED
Binary file (2.55 kB)

src/interfaces/__pycache__/speaker_manager.cpython-310.pyc
ADDED
Binary file (4.26 kB)

src/interfaces/__pycache__/transcriber.cpython-310.pyc
ADDED
Binary file (2.55 kB)
src/interfaces/audio_processor.py
ADDED
@@ -0,0 +1,53 @@
+"""
+Audio processing interface definitions
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, List, Tuple, Iterator, Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class AudioSegment:
+    """Audio segment representation"""
+    start: float
+    end: float
+    file_path: str
+    duration: float
+
+
+class IAudioProcessor(ABC):
+    """Interface for audio processing operations"""
+
+    @abstractmethod
+    async def split_audio_by_silence(
+        self,
+        audio_path: str,
+        min_segment_length: float = 30.0,
+        min_silence_length: float = 1.0
+    ) -> List[AudioSegment]:
+        """Split audio file by silence detection"""
+        pass
+
+    @abstractmethod
+    async def process_audio_segment(
+        self,
+        segment: AudioSegment,
+        model_name: str = "turbo",
+        language: Optional[str] = None,
+        enable_speaker_diarization: bool = False
+    ) -> Dict[str, Any]:
+        """Process a single audio segment"""
+        pass
+
+    @abstractmethod
+    async def process_complete_audio(
+        self,
+        audio_path: str,
+        model_name: str = "turbo",
+        language: Optional[str] = None,
+        enable_speaker_diarization: bool = False,
+        min_segment_length: float = 30.0
+    ) -> Dict[str, Any]:
+        """Process complete audio file"""
+        pass
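
A toy implementation sketch of this interface, cutting fixed-length windows instead of doing real silence detection; the class name and the 90-second stand-in duration are illustrative, not part of this upload:

from typing import Any, Dict, List, Optional

from src.interfaces.audio_processor import AudioSegment, IAudioProcessor

class FixedWindowProcessor(IAudioProcessor):
    """Cuts fixed-length windows; a real implementation would detect silence."""

    async def split_audio_by_silence(
        self, audio_path: str,
        min_segment_length: float = 30.0,
        min_silence_length: float = 1.0
    ) -> List[AudioSegment]:
        duration = 90.0  # stand-in; a real splitter would probe the file
        segments, start = [], 0.0
        while start < duration:
            end = min(start + min_segment_length, duration)
            segments.append(AudioSegment(start, end, audio_path, end - start))
            start = end
        return segments

    async def process_audio_segment(
        self, segment: AudioSegment, model_name: str = "turbo",
        language: Optional[str] = None, enable_speaker_diarization: bool = False
    ) -> Dict[str, Any]:
        # Placeholder result; a real implementation would transcribe the segment
        return {"start": segment.start, "end": segment.end, "text": ""}

    async def process_complete_audio(
        self, audio_path: str, model_name: str = "turbo",
        language: Optional[str] = None, enable_speaker_diarization: bool = False,
        min_segment_length: float = 30.0
    ) -> Dict[str, Any]:
        segments = await self.split_audio_by_silence(audio_path, min_segment_length)
        results = [await self.process_audio_segment(s, model_name, language,
                                                    enable_speaker_diarization)
                   for s in segments]
        return {"audio_file": audio_path, "segments": results}
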
src/interfaces/audio_splitter.py
ADDED
@@ -0,0 +1,48 @@
+"""
+Audio splitter interface definition
+"""
+
+from abc import ABC, abstractmethod
+from typing import Iterator, Tuple
+from dataclasses import dataclass
+
+
+@dataclass
+class AudioSegment:
+    """Audio segment data class"""
+    start: float
+    end: float
+    duration: float
+
+    def __post_init__(self):
+        if self.duration <= 0:
+            self.duration = self.end - self.start
+
+
+class IAudioSplitter(ABC):
+    """Interface for audio splitting"""
+
+    @abstractmethod
+    def split_audio(
+        self,
+        audio_path: str,
+        min_segment_length: float = 30.0,
+        min_silence_length: float = 1.0
+    ) -> Iterator[AudioSegment]:
+        """
+        Split audio into segments
+
+        Args:
+            audio_path: Path to audio file
+            min_segment_length: Minimum segment length in seconds
+            min_silence_length: Minimum silence length for splitting
+
+        Yields:
+            AudioSegment objects
+        """
+        pass
+
+    @abstractmethod
+    def get_audio_duration(self, audio_path: str) -> float:
+        """Get total duration of audio file"""
+        pass
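
The `__post_init__` hook means callers can pass a non-positive duration and have it derived from the boundaries; a quick illustration:

from src.interfaces.audio_splitter import AudioSegment

seg = AudioSegment(start=12.5, end=47.0, duration=0)
print(seg.duration)  # 34.5, filled in by __post_init__ because duration <= 0
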
src/interfaces/podcast_downloader.py
ADDED
@@ -0,0 +1,66 @@
+"""
+Podcast downloading interface definitions
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional, Tuple
+from dataclasses import dataclass
+from enum import Enum
+
+
+class PodcastPlatform(Enum):
+    """Podcast platform enumeration"""
+    APPLE = "apple"
+    XIAOYUZHOU = "xyz"
+    SPOTIFY = "spotify"
+    GENERIC = "generic"
+
+
+@dataclass
+class PodcastInfo:
+    """Podcast episode information"""
+    title: str
+    audio_url: str
+    episode_id: str
+    platform: PodcastPlatform
+    duration: Optional[float] = None
+    description: Optional[str] = None
+
+
+@dataclass
+class DownloadResult:
+    """Download operation result"""
+    success: bool
+    file_path: Optional[str]
+    podcast_info: Optional[PodcastInfo]
+    error_message: Optional[str] = None
+
+
+class IPodcastDownloader(ABC):
+    """Interface for podcast downloading operations"""
+
+    @abstractmethod
+    async def extract_podcast_info(self, url: str) -> PodcastInfo:
+        """Extract podcast information from URL"""
+        pass
+
+    @abstractmethod
+    async def download_podcast(
+        self,
+        url: str,
+        output_folder: str = "downloads",
+        convert_to_mp3: bool = False,
+        keep_original: bool = False
+    ) -> DownloadResult:
+        """Download podcast from URL"""
+        pass
+
+    @abstractmethod
+    def get_supported_platforms(self) -> list[PodcastPlatform]:
+        """Get list of supported platforms"""
+        pass
+
+    @abstractmethod
+    def can_handle_url(self, url: str) -> bool:
+        """Check if this downloader can handle the given URL"""
+        pass
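
`can_handle_url` enables a simple dispatch pattern over multiple downloader implementations; a sketch (the helper name is ours, not part of this upload):

from typing import List

from src.interfaces.podcast_downloader import IPodcastDownloader

def pick_downloader(url: str, downloaders: List[IPodcastDownloader]) -> IPodcastDownloader:
    # Return the first downloader that claims it can handle the URL
    for downloader in downloaders:
        if downloader.can_handle_url(url):
            return downloader
    raise ValueError(f"No downloader registered for: {url}")
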
src/interfaces/speaker_detector.py
ADDED
@@ -0,0 +1,71 @@
+"""
+Speaker detector interface definition
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional
+from dataclasses import dataclass
+import numpy as np
+
+
+@dataclass
+class SpeakerSegment:
+    """Speaker segment data class"""
+    start: float
+    end: float
+    speaker_id: str
+    confidence: Optional[float] = None
+
+
+@dataclass
+class SpeakerProfile:
+    """Speaker profile data class"""
+    speaker_id: str
+    embedding: np.ndarray
+    segments: List[SpeakerSegment]
+    total_duration: float
+
+
+class ISpeakerDetector(ABC):
+    """Interface for speaker detection and diarization"""
+
+    @abstractmethod
+    async def detect_speakers(
+        self,
+        audio_file_path: str,
+        audio_segments: Optional[List] = None
+    ) -> Dict[str, SpeakerProfile]:
+        """
+        Detect and identify speakers in audio
+
+        Args:
+            audio_file_path: Path to audio file
+            audio_segments: Optional pre-segmented audio
+
+        Returns:
+            Dictionary mapping speaker IDs to SpeakerProfile objects
+        """
+        pass
+
+    @abstractmethod
+    def map_to_global_speakers(
+        self,
+        local_speakers: Dict[str, SpeakerProfile],
+        source_file: str
+    ) -> Dict[str, str]:
+        """
+        Map local speakers to global speaker identities
+
+        Args:
+            local_speakers: Local speaker profiles
+            source_file: Source audio file path
+
+        Returns:
+            Mapping from local speaker ID to global speaker ID
+        """
+        pass
+
+    @abstractmethod
+    def get_speaker_summary(self) -> Dict:
+        """Get summary of all detected speakers"""
+        pass
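
A consumption sketch for this interface, assuming some concrete `detector` implementing `ISpeakerDetector` as declared above; `episode.wav` is a placeholder:

import asyncio

async def summarize_speakers(detector, audio_path: str) -> None:
    profiles = await detector.detect_speakers(audio_path)
    mapping = detector.map_to_global_speakers(profiles, source_file=audio_path)
    for local_id, profile in profiles.items():
        print(f"{local_id} -> {mapping.get(local_id)}: "
              f"{profile.total_duration:.1f}s across {len(profile.segments)} segments")

# asyncio.run(summarize_speakers(my_detector, "episode.wav"))
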