"""
Audio Processing Service - integrates audio segmentation and transcription.
"""

import re
import asyncio
import pathlib
import tempfile
from typing import Dict, Any, List, Optional

import ffmpeg

from ..interfaces.audio_processor import IAudioProcessor, AudioSegment
from ..interfaces.transcriber import ITranscriber
from ..interfaces.speaker_manager import ISpeakerIdentificationService
from ..utils.config import AudioProcessingConfig
from ..utils.errors import AudioProcessingError
from ..models.transcription import TranscriptionResponse, TranscriptionSegment


class AudioProcessingService(IAudioProcessor):
    """High-level audio processing service that coordinates transcription and speaker identification."""

    def __init__(
        self,
        transcriber: ITranscriber,
        speaker_service: Optional[ISpeakerIdentificationService] = None,
        config: Optional[AudioProcessingConfig] = None,
    ):
        self.transcriber = transcriber
        self.speaker_service = speaker_service
        self.config = config or AudioProcessingConfig()

    async def split_audio_by_silence(
        self,
        audio_path: str,
        min_segment_length: float = 30.0,
        min_silence_length: float = 1.0,
    ) -> List[AudioSegment]:
        """
        Intelligently split audio using FFmpeg's silencedetect filter.

        Cut points are placed in the middle of detected silences, and cuts
        that would produce a segment shorter than ``min_segment_length`` are
        skipped so short stretches merge into the following segment.
        """
        try:
            # silencedetect reports detected silences on stderr; parse the
            # "silence_end" / "silence_duration" pairs from those log lines.
            silence_end_re = re.compile(
                r" silence_end: (?P<end>[0-9]+(\.?[0-9]*)) \| silence_duration: (?P<dur>[0-9]+(\.?[0-9]*))"
            )

            metadata = ffmpeg.probe(audio_path)
            duration = float(metadata["format"]["duration"])

            # Run silencedetect without producing any output media; the
            # detection results are read from the process's stderr pipe.
            reader = (
                ffmpeg.input(str(audio_path))
                .filter("silencedetect", n="-10dB", d=min_silence_length)
                .output("pipe:", format="null")
                .run_async(pipe_stderr=True)
            )

            segments = []
            cur_start = 0.0

            while True:
                line = reader.stderr.readline().decode("utf-8")
                if not line:
                    break

                match = silence_end_re.search(line)
                if match:
                    silence_end, silence_dur = match.group("end"), match.group("dur")
                    # Cut in the middle of the detected silence.
                    split_at = float(silence_end) - (float(silence_dur) / 2)

                    # Skip cut points that would create a too-short segment.
                    if (split_at - cur_start) < min_segment_length:
                        continue

                    segments.append(AudioSegment(
                        start=cur_start,
                        end=split_at,
                        file_path=audio_path,
                        duration=split_at - cur_start,
                    ))
                    cur_start = split_at

            # Reap the FFmpeg process now that stderr has been drained.
            reader.wait()

            # Append the remainder of the file as the final segment.
            if duration > cur_start:
                segments.append(AudioSegment(
                    start=cur_start,
                    end=duration,
                    file_path=audio_path,
                    duration=duration - cur_start,
                ))

            print(f"Audio split into {len(segments)} segments")
            return segments

        except Exception as e:
            raise AudioProcessingError(f"Audio segmentation failed: {str(e)}") from e

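    # For reference (illustrative, not executed): the ffmpeg-python pipeline in
    # split_audio_by_silence() builds a command roughly equivalent to
    #
    #     ffmpeg -i input.wav -af silencedetect=n=-10dB:d=1.0 -f null -
    #
    # and silencedetect reports detected silences on stderr in log lines such as
    #
    #     [silencedetect @ 0x...] silence_start: 12.34
    #     [silencedetect @ 0x...] silence_end: 14.02 | silence_duration: 1.68
    #
    # which is the format silence_end_re above is written to match.
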
    async def process_audio_segment(
        self,
        segment: AudioSegment,
        model_name: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False,
    ) -> Dict[str, Any]:
        """
        Process a single audio segment: extract it to a temporary WAV file,
        transcribe it, and shift the resulting timestamps back onto the
        original file's timeline.
        """
        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_path = temp_file.name

            try:
                # Extract just this segment from the source file.
                (
                    ffmpeg.input(segment.file_path, ss=segment.start, t=segment.duration)
                    .output(temp_path)
                    .overwrite_output()
                    .run(quiet=True)
                )

                result = await self.transcriber.transcribe(
                    audio_file_path=temp_path,
                    model_size=model_name,
                    language=language,
                    enable_speaker_diarization=enable_speaker_diarization
                )
            finally:
                # Always remove the temporary clip, even if extraction or
                # transcription failed.
                pathlib.Path(temp_path).unlink(missing_ok=True)

            # Timestamps from the transcriber are relative to the extracted
            # clip; offset them by the segment's start to get absolute times.
            adjusted_segments = []
            for seg in result.segments:
                adjusted_segments.append(TranscriptionSegment(
                    start=seg.start + segment.start,
                    end=seg.end + segment.start,
                    text=seg.text,
                    speaker=seg.speaker,
                    confidence=seg.confidence
                ))

            return {
                "segment_start": segment.start,
                "segment_end": segment.end,
                "text": result.text,
                "segments": [
                    {
                        "start": seg.start,
                        "end": seg.end,
                        "text": seg.text,
                        "speaker": seg.speaker,
                        "confidence": seg.confidence
                    } for seg in adjusted_segments
                ],
                "language_detected": result.language,
                "model_used": result.model_used
            }

        except Exception as e:
            raise AudioProcessingError(f"Segment processing failed: {str(e)}") from e

    async def process_complete_audio(
        self,
        audio_path: str,
        model_name: str = "turbo",
        language: Optional[str] = None,
        enable_speaker_diarization: bool = False,
        min_segment_length: float = 30.0,
    ) -> Dict[str, Any]:
        """
        Process a complete audio file with intelligent segmentation.
        """
        try:
            print(f"🚀 Starting complete audio processing: {audio_path}")

            metadata = ffmpeg.probe(audio_path)
            total_duration = float(metadata["format"]["duration"])

            # Split the file at silences so segments can be transcribed
            # independently and concurrently.
            segments = await self.split_audio_by_silence(
                audio_path=audio_path,
                min_segment_length=min_segment_length,
                min_silence_length=1.0
            )

            # Limit concurrency to avoid overloading the transcriber.
            semaphore = asyncio.Semaphore(3)

            async def process_segment_with_semaphore(segment):
                async with semaphore:
                    return await self.process_audio_segment(
                        segment=segment,
                        model_name=model_name,
                        language=language,
                        enable_speaker_diarization=enable_speaker_diarization
                    )

            segment_results = await asyncio.gather(*[
                process_segment_with_semaphore(segment) for segment in segments
            ])

            # Merge per-segment results back into a single transcript.
            all_segments = []
            combined_text = []

            for result in segment_results:
                all_segments.extend(result["segments"])
                if result["text"].strip():
                    combined_text.append(result["text"].strip())

            # Optionally run speaker identification over the full file and
            # map the resulting speaker labels onto the transcript segments.
            if enable_speaker_diarization and self.speaker_service:
                try:
                    speaker_segments = await self.speaker_service.identify_speakers_in_audio(
                        audio_path=audio_path,
                        transcription_segments=all_segments
                    )

                    all_segments = await self.speaker_service.map_transcription_to_speakers(
                        transcription_segments=all_segments,
                        speaker_segments=speaker_segments
                    )
                except Exception as e:
                    # Speaker identification is best-effort; fall back to the
                    # plain transcript if it fails.
                    print(f"⚠️ Speaker identification failed: {e}")

            return {
                "text": " ".join(combined_text),
                "segments": all_segments,
                "audio_duration": total_duration,
                "segment_count": len(all_segments),
                "processing_segments": len(segments),
                "language_detected": segment_results[0]["language_detected"] if segment_results else "unknown",
                "model_used": model_name,
                "speaker_diarization_enabled": enable_speaker_diarization,
                "processing_status": "success"
            }

        except Exception as e:
            raise AudioProcessingError(f"Complete audio processing failed: {str(e)}") from e

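    # Illustrative shape of the dict returned by process_complete_audio();
    # keys mirror the return statement above, values are made-up examples:
    #
    #     {
    #         "text": "Hello everyone and welcome ...",
    #         "segments": [
    #             {"start": 0.0, "end": 4.2, "text": "Hello everyone ...",
    #              "speaker": "SPEAKER_00", "confidence": 0.93},
    #             ...
    #         ],
    #         "audio_duration": 1843.6,
    #         "segment_count": 412,
    #         "processing_segments": 31,
    #         "language_detected": "en",
    #         "model_used": "turbo",
    #         "speaker_diarization_enabled": True,
    #         "processing_status": "success",
    #     }
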
    def get_supported_models(self) -> List[str]:
        """Get supported transcription models."""
        return self.transcriber.get_supported_models()

    def get_supported_languages(self) -> List[str]:
        """Get supported languages."""
        return self.transcriber.get_supported_languages()
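

# Minimal usage sketch (comments only, not executed). It assumes some concrete
# ITranscriber implementation is available to pass in; "WhisperTranscriber" and
# its module path below are hypothetical names used purely for illustration.
#
#     import asyncio
#     from myapp.transcribers import WhisperTranscriber  # hypothetical
#
#     async def main():
#         service = AudioProcessingService(transcriber=WhisperTranscriber())
#         result = await service.process_complete_audio(
#             audio_path="meeting.wav",
#             model_name="turbo",
#             enable_speaker_diarization=False,
#         )
#         print(result["text"])
#
#     asyncio.run(main())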