File size: 3,440 Bytes
b5df735 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
"""
Transcription models
"""
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
from enum import Enum
from .base import BaseRequest, BaseResponse, OperationStatus
class ModelSize(str, Enum):
    """Selectable Whisper model sizes.

    The enum mixes in ``str``, so every member compares equal to its
    plain string value (``ModelSize.TINY == "tiny"``) and can be looked
    up from that string (``ModelSize("tiny")``).
    """

    TINY = "tiny"
    BASE = "base"
    SMALL = "small"
    MEDIUM = "medium"
    LARGE = "large"
    TURBO = "turbo"
class OutputFormat(str, Enum):
    """File formats a transcription can be written in.

    Members are also ``str`` instances, so they compare equal to their
    lowercase string value (``OutputFormat.SRT == "srt"``).
    """

    TXT = "txt"
    SRT = "srt"
    JSON = "json"
@dataclass
class TranscriptionRequest(BaseRequest):
    """Input parameters for a single transcription job.

    Only ``audio_file_path`` is required; every other option falls back
    to the default declared below.
    """

    # NOTE(review): this is the only field without a default. If
    # BaseRequest declares fields that have defaults, the dataclass
    # machinery rejects a non-default field after them -- confirm
    # against the base class.
    audio_file_path: str

    # Model to run; TURBO unless the caller picks another size.
    model_size: ModelSize = ModelSize.TURBO

    # Optional language value; None when the caller does not supply one
    # (presumably the backend then detects it -- confirm).
    language: Optional[str] = None

    # Format of the generated transcript; SRT by default.
    output_format: OutputFormat = OutputFormat.SRT

    # Speaker diarization is opt-in.
    enable_speaker_diarization: bool = False
@dataclass
class TranscriptionSegment:
    """One piece of transcribed text together with its time range.

    ``start``, ``end`` and ``text`` are required; ``speaker`` and
    ``confidence`` are optional and default to ``None``.
    """

    # Segment boundaries (presumably seconds from the start of the
    # audio -- confirm with the producer of these values).
    start: float
    end: float

    # Text recognised for this segment.
    text: str

    # Speaker label, when one has been assigned.
    speaker: Optional[str] = None

    # Confidence score, when one is available.
    confidence: Optional[float] = None
@dataclass
class SpeakerInfo:
    """Speaker diarization details attached to a transcription.

    Every field has a default, so ``SpeakerInfo()`` describes a result
    with diarization disabled and no speakers recorded.
    """

    # Whether diarization was enabled.
    enabled: bool = False

    # Number of speakers counted; zero by default.
    global_speaker_count: int = 0

    # String-to-string speaker mapping. default_factory gives every
    # instance its own dict rather than one shared object.
    speaker_mapping: Dict[str, str] = field(default_factory=dict)

    # Free-form per-speaker summary data; also a fresh dict per instance.
    speaker_summary: Dict[str, Any] = field(default_factory=dict)
@dataclass
class TranscriptionFiles:
    """Paths of the transcript files produced for one audio file.

    Each path is ``None`` until it is set.
    """

    txt_file_path: Optional[str] = None
    srt_file_path: Optional[str] = None
    json_file_path: Optional[str] = None

    @property
    def all_files(self) -> List[str]:
        """Return the paths that are set, in txt, srt, json order.

        The filter is a truthiness check, so an empty string is skipped
        just like ``None``.
        """
        candidates = (
            self.txt_file_path,
            self.srt_file_path,
            self.json_file_path,
        )
        return [path for path in candidates if path]
@dataclass
class TranscriptionMetrics:
    """Figures recorded about one transcription run.

    All fields have defaults, so an empty instance can stand in when no
    measurements are available.
    """

    # Length of the audio (presumably in seconds -- confirm).
    audio_duration: float = 0.0

    # Time spent processing (presumably in seconds -- confirm).
    processing_time: float = 0.0

    # Number of segments produced.
    segment_count: int = 0

    # Name of the model that was used; empty when not recorded.
    model_used: str = ""

    # Detected language; the literal "unknown" until one is set.
    language_detected: str = "unknown"
@dataclass
class TranscriptionResponse(BaseResponse):
    """Result of a transcription job.

    Adds the transcription payload (audio file, generated files,
    segments, speaker info and metrics) to the fields inherited from
    ``BaseResponse``. The ``status``, ``message``, ``error_code`` and
    ``error_details`` keywords used by the factories below are assumed
    to be declared by ``BaseResponse`` -- they are not defined here.

    Use :meth:`success` or :meth:`failed` to build instances.
    """

    # Audio file this response refers to.
    audio_file: str = ""

    # Mutable and nested defaults use default_factory so that every
    # response owns its own objects.
    files: TranscriptionFiles = field(default_factory=TranscriptionFiles)
    segments: List[TranscriptionSegment] = field(default_factory=list)
    speaker_info: SpeakerInfo = field(default_factory=SpeakerInfo)
    metrics: TranscriptionMetrics = field(default_factory=TranscriptionMetrics)

    @classmethod
    def success(
        cls,
        audio_file: str,
        files: TranscriptionFiles,
        segments: List[TranscriptionSegment],
        metrics: TranscriptionMetrics,
        speaker_info: Optional[SpeakerInfo] = None,
        # Default user-facing text is Chinese for "transcription complete".
        message: str = "转录完成"
    ) -> "TranscriptionResponse":
        """Build a response with status ``OperationStatus.SUCCESS``.

        Args:
            audio_file: Audio file that was transcribed.
            files: Transcript files that were generated.
            segments: Transcribed segments.
            metrics: Metrics recorded for the run.
            speaker_info: Diarization details; a default ``SpeakerInfo()``
                is substituted when this is ``None`` (or otherwise falsy).
            message: Message stored on the response.

        Returns:
            A new instance of ``cls`` populated with the arguments.
        """
        return cls(
            status=OperationStatus.SUCCESS,
            message=message,
            audio_file=audio_file,
            files=files,
            segments=segments,
            speaker_info=speaker_info or SpeakerInfo(),
            metrics=metrics,
        )

    @classmethod
    def failed(
        cls,
        audio_file: str,
        error_message: str,
        error_code: str = "TRANSCRIPTION_ERROR",
        error_details: Optional[Dict[str, Any]] = None
    ) -> "TranscriptionResponse":
        """Build a response with status ``OperationStatus.FAILED``.

        The payload fields (files, segments, speaker info, metrics) are
        not passed and therefore keep their defaults.

        Args:
            audio_file: Audio file the failed job was working on.
            error_message: Description of the failure; stored as the
                response ``message``.
            error_code: Error code stored on the response.
            error_details: Optional extra data about the error; passed
                through unchanged, including ``None``.

        Returns:
            A new instance of ``cls`` describing the failure.
        """
        return cls(
            status=OperationStatus.FAILED,
            message=error_message,
            error_code=error_code,
            error_details=error_details,
            audio_file=audio_file,
        )