# src/services/audio_processing_service.py
"""
Audio Processing Service - integrates audio segmentation and transcription
"""
import re
import asyncio
import pathlib
import tempfile
from typing import Dict, Any, List, Optional
import ffmpeg
from ..interfaces.audio_processor import IAudioProcessor, AudioSegment
from ..interfaces.transcriber import ITranscriber
from ..interfaces.speaker_manager import ISpeakerIdentificationService
from ..utils.config import AudioProcessingConfig
from ..utils.errors import AudioProcessingError
from ..models.transcription import TranscriptionResponse, TranscriptionSegment
class AudioProcessingService(IAudioProcessor):
"""High-level audio processing service that coordinates transcription and speaker identification"""
def __init__(
self,
transcriber: ITranscriber,
speaker_service: Optional[ISpeakerIdentificationService] = None,
config: Optional[AudioProcessingConfig] = None
):
self.transcriber = transcriber
self.speaker_service = speaker_service
self.config = config or AudioProcessingConfig()
async def split_audio_by_silence(
self,
audio_path: str,
min_segment_length: float = 30.0,
min_silence_length: float = 1.0
) -> List[AudioSegment]:
"""
Intelligently split audio using FFmpeg's silencedetect filter
"""
try:
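            # silencedetect logs lines of the form
            #   "silence_end: <seconds> | silence_duration: <seconds>"
            # on stderr; this regex captures the end time and duration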
silence_end_re = re.compile(
r" silence_end: (?P<end>[0-9]+(\.?[0-9]*)) \| silence_duration: (?P<dur>[0-9]+(\.?[0-9]*))"
)
# Get audio duration
metadata = ffmpeg.probe(audio_path)
duration = float(metadata["format"]["duration"])
# Use silence detection filter
reader = (
ffmpeg.input(str(audio_path))
.filter("silencedetect", n="-10dB", d=min_silence_length)
.output("pipe:", format="null")
.run_async(pipe_stderr=True)
)
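            # silencedetect writes its detections to stderr; the null muxer
            # discards the decoded audio itself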
segments = []
cur_start = 0.0
while True:
line = reader.stderr.readline().decode("utf-8")
if not line:
break
match = silence_end_re.search(line)
if match:
silence_end, silence_dur = match.group("end"), match.group("dur")
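                    # Split in the middle of the silence so speech on either
                    # side of the cut is not clipped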
split_at = float(silence_end) - (float(silence_dur) / 2)
if (split_at - cur_start) < min_segment_length:
continue
segments.append(AudioSegment(
start=cur_start,
end=split_at,
file_path=audio_path,
duration=split_at - cur_start
))
cur_start = split_at
            # Reap the ffmpeg process now that its stderr is exhausted
            reader.wait()
            # Handle the last segment
if duration > cur_start:
segments.append(AudioSegment(
start=cur_start,
end=duration,
file_path=audio_path,
duration=duration - cur_start
))
print(f"Audio split into {len(segments)} segments")
return segments
        except Exception as e:
            raise AudioProcessingError(f"Audio segmentation failed: {e}") from e
async def process_audio_segment(
self,
segment: AudioSegment,
model_name: str = "turbo",
language: Optional[str] = None,
enable_speaker_diarization: bool = False
) -> Dict[str, Any]:
"""
Process a single audio segment
"""
        temp_path: Optional[str] = None
        try:
            # Create temporary segment file; delete=False because ffmpeg
            # reopens the path after the handle is closed
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_path = temp_file.name
# Extract segment using ffmpeg
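            # (ss/t select the window in seconds; the .wav suffix tells ffmpeg
            # which container to write)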
(
ffmpeg.input(segment.file_path, ss=segment.start, t=segment.duration)
.output(temp_path)
.overwrite_output()
.run(quiet=True)
)
# Transcribe segment
result = await self.transcriber.transcribe(
audio_file_path=temp_path,
model_size=model_name,
language=language,
enable_speaker_diarization=enable_speaker_diarization
)
# Adjust timestamps to match original audio
adjusted_segments = []
for seg in result.segments:
adjusted_segments.append(TranscriptionSegment(
start=seg.start + segment.start,
end=seg.end + segment.start,
text=seg.text,
speaker=seg.speaker,
confidence=seg.confidence
))
# Clean up temp file
pathlib.Path(temp_path).unlink(missing_ok=True)
return {
"segment_start": segment.start,
"segment_end": segment.end,
"text": result.text,
"segments": [
{
"start": seg.start,
"end": seg.end,
"text": seg.text,
"speaker": seg.speaker,
"confidence": seg.confidence
} for seg in adjusted_segments
],
"language_detected": result.language,
"model_used": result.model_used
}
        except Exception as e:
            # Don't leak the temp file if extraction or transcription fails
            if temp_path:
                pathlib.Path(temp_path).unlink(missing_ok=True)
            raise AudioProcessingError(f"Segment processing failed: {e}") from e
async def process_complete_audio(
self,
audio_path: str,
model_name: str = "turbo",
language: Optional[str] = None,
enable_speaker_diarization: bool = False,
min_segment_length: float = 30.0
) -> Dict[str, Any]:
"""
Process complete audio file with intelligent segmentation
"""
try:
print(f"🚀 Starting complete audio processing: {audio_path}")
# Get audio metadata
metadata = ffmpeg.probe(audio_path)
total_duration = float(metadata["format"]["duration"])
# Split audio into segments
segments = await self.split_audio_by_silence(
audio_path=audio_path,
min_segment_length=min_segment_length,
min_silence_length=1.0
)
# Process segments in parallel (with limited concurrency)
semaphore = asyncio.Semaphore(3) # Limit concurrent processing
async def process_segment_with_semaphore(segment):
async with semaphore:
return await self.process_audio_segment(
segment=segment,
model_name=model_name,
language=language,
enable_speaker_diarization=enable_speaker_diarization
)
# Process all segments
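            # asyncio.gather preserves argument order, so results stay chronological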
segment_results = await asyncio.gather(*[
process_segment_with_semaphore(segment) for segment in segments
])
# Combine results
all_segments = []
combined_text = []
for result in segment_results:
all_segments.extend(result["segments"])
if result["text"].strip():
combined_text.append(result["text"].strip())
# Apply speaker identification if enabled
if enable_speaker_diarization and self.speaker_service:
try:
speaker_segments = await self.speaker_service.identify_speakers_in_audio(
audio_path=audio_path,
transcription_segments=all_segments
)
# Map transcription to speakers
all_segments = await self.speaker_service.map_transcription_to_speakers(
transcription_segments=all_segments,
speaker_segments=speaker_segments
)
except Exception as e:
print(f"⚠️ Speaker identification failed: {e}")
return {
"text": " ".join(combined_text),
"segments": all_segments,
"audio_duration": total_duration,
"segment_count": len(all_segments),
"processing_segments": len(segments),
"language_detected": segment_results[0]["language_detected"] if segment_results else "unknown",
"model_used": model_name,
"speaker_diarization_enabled": enable_speaker_diarization,
"processing_status": "success"
}
        except AudioProcessingError:
            raise
        except Exception as e:
            raise AudioProcessingError(f"Complete audio processing failed: {e}") from e
def get_supported_models(self) -> List[str]:
"""Get supported transcription models"""
return self.transcriber.get_supported_models()
def get_supported_languages(self) -> List[str]:
"""Get supported languages"""
return self.transcriber.get_supported_languages()
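

# ---------------------------------------------------------------------------
# Hedged usage sketch (illustration only, not part of the service).
# ``_StubTranscriber`` is a hypothetical stand-in for a real ITranscriber
# implementation, and the TranscriptionResponse field names below are inferred
# from how this service reads the response; both may differ from the actual
# definitions. Run as a module so the relative imports resolve, e.g.:
#   python -m src.services.audio_processing_service path/to/audio.wav
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    class _StubTranscriber(ITranscriber):
        async def transcribe(self, audio_file_path, model_size="turbo",
                             language=None, enable_speaker_diarization=False):
            # A real implementation would run a speech model here
            return TranscriptionResponse(
                text="", segments=[], language=language or "unknown",
                model_used=model_size
            )

        def get_supported_models(self):
            return ["turbo"]

        def get_supported_languages(self):
            return ["en"]

    async def _demo(path: str) -> None:
        service = AudioProcessingService(transcriber=_StubTranscriber())
        result = await service.process_complete_audio(audio_path=path)
        print(result["processing_status"], f'{result["segment_count"]} segments')

    asyncio.run(_demo(sys.argv[1]))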