ciyidogan committed on
Commit
0b9eed5
·
verified ·
1 Parent(s): b728d57

Create stt_base.py

Browse files
Files changed (1) hide show
  1. stt/stt_base.py +206 -0
stt/stt_base.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base STT Implementation
3
+ ======================
4
+ Common audio processing and validation for all STT providers
5
+ """
6
import struct
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, Tuple, List

from .stt_interface import STTInterface, STTConfig, TranscriptionResult
from utils.logger import log_info, log_error, log_debug, log_warning
13
+
14
+
15
class STTBase(STTInterface, ABC):
    """Base class for all STT implementations with common audio processing.

    ``transcribe`` validates and analyzes the incoming audio, optionally
    preprocesses it, and then delegates to the provider-specific
    ``_transcribe_impl``. Audio buffers are decoded with the ``struct``
    format ``h``, i.e. as 16-bit signed samples in native byte order.

    NOTE(review): some log-message prefixes look mis-encoded (mojibake); they
    are reproduced unchanged here -- confirm the file encoding upstream.
    """

    def __init__(self):
        super().__init__()

    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Main transcription method with preprocessing.

        Args:
            audio_data: Raw audio buffer to transcribe.
            config: STT configuration; ``config.sample_rate`` is used for the
                audio analysis and the whole object is forwarded to the
                preprocessing and provider hooks.

        Returns:
            The result of ``_transcribe_impl``, or ``None`` when the input is
            empty, fails validation, or any exception is raised.
        """
        try:
            # 1. Validate input
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"πŸ“Š Transcribing {len(audio_data)} bytes of audio")

            # 2. Analyze and validate audio
            analysis_result = self._analyze_audio(audio_data, config.sample_rate)
            if not analysis_result.is_valid:
                log_warning(f"⚠️ Audio validation failed: {analysis_result.reason}")
                return None

            # 3. Preprocess audio if needed
            processed_audio = self._preprocess_audio(audio_data, config)

            # 4. Call provider-specific implementation
            return await self._transcribe_impl(processed_audio, config, analysis_result)

        except Exception as e:
            # Top-level boundary: log the failure with its traceback and
            # report "no result" instead of propagating to the caller.
            log_error(f"❌ Error during transcription: {str(e)}")
            import traceback
            log_error(f"Traceback: {traceback.format_exc()}")
            return None

    @abstractmethod
    async def _transcribe_impl(self, audio_data: bytes, config: STTConfig, analysis: 'AudioAnalysis') -> Optional[TranscriptionResult]:
        """Provider-specific transcription implementation.

        Called by ``transcribe`` with the preprocessed audio, the original
        config, and the analysis of the unprocessed audio.
        """
        pass

    @staticmethod
    def _unpack_samples(audio_data: bytes) -> Tuple[int, ...]:
        """Decode a byte buffer into 16-bit signed samples (native byte order).

        A trailing odd byte cannot form a complete sample and is ignored, so
        odd-length buffers no longer make ``struct.unpack`` fail.
        """
        sample_count = len(audio_data) // 2
        return struct.unpack(f'{sample_count}h', audio_data[:sample_count * 2])

    def _analyze_audio(self, audio_data: bytes, sample_rate: int) -> 'AudioAnalysis':
        """Analyze audio quality and content.

        Computes overall and per-section amplitude statistics, locates the
        first window that looks like speech, and decides whether the audio
        is worth sending to a provider.

        Args:
            audio_data: Raw audio buffer, decoded as 16-bit signed samples.
            sample_rate: Samples per second, used to convert the speech start
                index into seconds.

        Returns:
            An ``AudioAnalysis``. ``is_valid`` is ``False`` (with ``reason``
            set) when the buffer holds no complete sample, the peak amplitude
            is below 100, more than 95% of the samples are zero, no speech
            window is found, or the analysis itself raises.
        """
        section_count = 10

        try:
            samples = self._unpack_samples(audio_data)
            total_samples = len(samples)

            # Nothing to measure; bail out before any ratio divides by zero.
            if total_samples == 0:
                return AudioAnalysis(
                    total_samples=0,
                    sample_rate=sample_rate,
                    is_valid=False,
                    reason="Audio contains no complete 16-bit samples"
                )

            # Basic statistics (amplitudes are computed over non-zero samples only)
            non_zero_amplitudes = [abs(s) for s in samples if s != 0]
            zero_count = total_samples - len(non_zero_amplitudes)

            if non_zero_amplitudes:
                avg_amplitude = sum(non_zero_amplitudes) / len(non_zero_amplitudes)
                max_amplitude = max(non_zero_amplitudes)
            else:
                avg_amplitude = 0
                max_amplitude = 0

            log_info(f"πŸ” Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_count/total_samples:.1%})")
            log_info(f"πŸ” Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")

            # Section analysis (10 sections); the last section absorbs the remainder
            section_size = total_samples // section_count
            sections = []

            for i in range(section_count):
                start_idx = i * section_size
                end_idx = (i + 1) * section_size if i < section_count - 1 else total_samples
                section = samples[start_idx:end_idx]

                section_amplitudes = [abs(s) for s in section if s != 0]
                section_max = max(section_amplitudes) if section_amplitudes else 0
                section_avg = sum(section_amplitudes) / len(section_amplitudes) if section_amplitudes else 0
                # Sections are empty when there are fewer samples than
                # sections; report a zero ratio instead of dividing by zero.
                zero_ratio = (len(section) - len(section_amplitudes)) / len(section) if section else 0.0

                sections.append({
                    'max': section_max,
                    'avg': section_avg,
                    'zero_ratio': zero_ratio
                })

                log_info(f" Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={zero_ratio:.1%}")

            # Find speech start
            speech_start_idx = self._find_speech_start(samples, sample_rate)
            speech_start_time = speech_start_idx / sample_rate if speech_start_idx >= 0 else -1.0

            if speech_start_idx >= 0:
                log_info(f"🎀 Speech detected starting at sample {speech_start_idx} ({speech_start_time:.2f}s)")
            else:
                log_warning("⚠️ No speech detected above threshold in entire audio")

            # Validation: the first failing check determines the reason
            is_valid = True
            reason = ""

            if max_amplitude < 100:
                is_valid = False
                reason = f"Audio appears silent: max_amplitude={max_amplitude}"
            elif zero_count / total_samples > 0.95:
                is_valid = False
                reason = f"Audio is mostly zeros: {zero_count/total_samples:.1%}"
            elif speech_start_idx < 0:
                is_valid = False
                reason = "No speech detected"

            return AudioAnalysis(
                total_samples=total_samples,
                sample_rate=sample_rate,
                zero_count=zero_count,
                avg_amplitude=avg_amplitude,
                max_amplitude=max_amplitude,
                sections=sections,
                speech_start_idx=speech_start_idx,
                speech_start_time=speech_start_time,
                is_valid=is_valid,
                reason=reason
            )

        except Exception as e:
            # Best-effort: any failure yields an invalid analysis rather than a crash.
            log_error(f"Audio analysis failed: {e}")
            return AudioAnalysis(
                total_samples=0,
                sample_rate=sample_rate,
                is_valid=False,
                reason=f"Analysis failed: {e}"
            )

    def _find_speech_start(self, samples: List[int], sample_rate: int, threshold: int = 500) -> int:
        """Find the starting point of speech in audio.

        Scans consecutive, non-overlapping windows of 100 samples and returns
        the start index of the first window whose RMS exceeds ``threshold``.

        Args:
            samples: Decoded audio samples.
            sample_rate: Currently unused; kept for interface compatibility.
            threshold: RMS level a window must exceed to count as speech.

        Returns:
            Index of the first sample of the first qualifying window, or -1
            when no full window exceeds the threshold.
        """
        window_size = 100

        # "+ 1" makes the range include the final full window; without it a
        # buffer of exactly one window (or a loud last window) was skipped.
        for start in range(0, len(samples) - window_size + 1, window_size):
            window = samples[start:start + window_size]
            rms = (sum(s * s for s in window) / window_size) ** 0.5

            if rms > threshold:
                return start

        return -1

    def _preprocess_audio(self, audio_data: bytes, config: STTConfig) -> bytes:
        """Preprocess audio if needed (can be overridden by providers)."""
        # Default: no preprocessing
        return audio_data

    def _clean_audio_silence(self, audio_data: bytes, threshold: int = 50) -> bytes:
        """Remove leading/trailing silence.

        Keeps the span between the first and last sample whose absolute value
        exceeds ``threshold``, plus up to 100 samples of padding on each side.
        When no sample exceeds the threshold the full sample range is kept.

        Args:
            audio_data: Raw audio buffer, decoded as 16-bit signed samples.
            threshold: Absolute amplitude above which a sample is non-silent.

        Returns:
            The trimmed audio re-packed as bytes, or the original buffer if
            cleaning fails.
        """
        padding = 100

        try:
            samples = self._unpack_samples(audio_data)
            last_idx = len(samples) - 1

            # Find first non-silent sample (defaults to the start of the buffer)
            start_idx = next(
                (i for i, sample in enumerate(samples) if abs(sample) > threshold),
                0,
            )

            # Find last non-silent sample (defaults to the end of the buffer)
            end_idx = next(
                (i for i in range(last_idx, -1, -1) if abs(samples[i]) > threshold),
                last_idx,
            )

            # Add padding, clamped to the buffer bounds
            start_idx = max(0, start_idx - padding)
            end_idx = min(last_idx, end_idx + padding)

            # Convert back
            cleaned_samples = samples[start_idx:end_idx + 1]
            cleaned_audio = struct.pack(f'{len(cleaned_samples)}h', *cleaned_samples)

            log_debug(f"Audio cleaning: {len(audio_data)} β†’ {len(cleaned_audio)} bytes")
            return cleaned_audio

        except Exception as e:
            # Deliberate best-effort: cleaning is optional, so fall back to the input.
            log_warning(f"Audio cleaning failed: {e}, using original")
            return audio_data
192
+
193
+
194
@dataclass
class AudioAnalysis:
    """Audio analysis results.

    Built by ``STTBase._analyze_audio``. The defaults describe an
    empty/invalid analysis, which is what the failure path returns.
    """

    # Number of decoded samples in the analyzed buffer.
    total_samples: int = 0
    # Sample rate (samples per second) the analysis was run with.
    sample_rate: int = 16000
    # Number of samples that are exactly zero.
    zero_count: int = 0
    # Mean absolute amplitude over the non-zero samples.
    avg_amplitude: float = 0.0
    # Largest absolute amplitude over the non-zero samples.
    max_amplitude: int = 0
    # Per-section statistics; each entry has 'max', 'avg' and 'zero_ratio' keys.
    sections: List[dict] = field(default_factory=list)
    # Index of the first sample of the first detected speech window, -1 if none.
    speech_start_idx: int = -1
    # ``speech_start_idx`` converted to seconds, -1.0 if no speech was detected.
    speech_start_time: float = -1.0
    # Whether the audio passed validation.
    is_valid: bool = False
    # Explanation of why validation failed; empty when valid.
    reason: str = ""