File size: 5,390 Bytes
b0a4866
 
 
 
 
 
c51c470
 
8d8ad71
c51c470
 
 
 
 
 
 
 
8d8ad71
c51c470
b0a4866
 
 
 
 
 
c51c470
 
 
b0a4866
 
8d8ad71
b0a4866
8d8ad71
b0a4866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c51c470
b0a4866
 
 
 
 
 
 
 
 
8d8ad71
b0a4866
 
8d8ad71
b0a4866
c51c470
b0a4866
 
 
c51c470
 
b0a4866
c51c470
b0a4866
 
c51c470
 
 
 
 
 
 
 
b0a4866
 
8d8ad71
b0a4866
c51c470
b0a4866
 
c51c470
 
b0a4866
c51c470
 
8d8ad71
c51c470
 
 
 
 
8d8ad71
c51c470
 
b0a4866
c51c470
b0a4866
c51c470
b0a4866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c51c470
b0a4866
c51c470
 
 
 
 
 
 
 
 
 
b0a4866
c51c470
b0a4866
 
c51c470
b0a4866
 
c51c470
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Google Cloud Speech-to-Text Implementation
"""
import os
import asyncio
from typing import AsyncIterator, Optional, List
from datetime import datetime
import sys
from logger import log_info, log_error, log_debug, log_warning

# Import Google Cloud Speech only if available
try:
    from google.cloud import speech_v1p1beta1 as speech
    from google.api_core import exceptions
    GOOGLE_SPEECH_AVAILABLE = True
except ImportError:
    GOOGLE_SPEECH_AVAILABLE = False
    log_info("⚠️ Google Cloud Speech library not installed")

from stt_interface import STTInterface, STTConfig, TranscriptionResult

class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation of ``STTInterface``.

    NOTE: streaming recognition is currently a stub. ``stream_audio`` buffers
    incoming chunks in ``audio_queue`` but never yields a transcription, and
    ``stop_streaming`` never returns a final result. Nothing in this class
    consumes ``audio_queue``; it is emptied whenever a session starts or stops
    so buffered audio cannot accumulate across sessions.
    """

    def __init__(self, credentials_path: Optional[str]):
        """Create the async Speech client.

        Args:
            credentials_path: Path to a credentials file. When it is non-empty
                and exists on disk it is exported through the
                ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable;
                otherwise the environment is left untouched.

        Raises:
            ImportError: If the ``google-cloud-speech`` package could not be
                imported when this module was loaded.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            raise ImportError("google-cloud-speech library not installed. Run: pip install google-cloud-speech")

        if credentials_path and os.path.exists(credentials_path):
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
            log_info(f"✅ Google credentials set from: {credentials_path}")
        else:
            log_info("⚠️ Google credentials path not found, using default credentials")

        self.client = speech.SpeechAsyncClient()
        self.streaming_config = None  # built by start_streaming()
        self.is_streaming = False
        self.audio_queue = asyncio.Queue()  # chunks received by stream_audio()

    async def start_streaming(self, config: STTConfig) -> None:
        """Build the streaming configuration and mark the session as active.

        Args:
            config: Recognition settings (encoding, sample rate, language,
                punctuation, word timestamps, model, interim results, ...).

        Raises:
            Exception: Any error raised while building the Google config
                objects is logged and re-raised unchanged.
        """
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(config.encoding),
                sample_rate_hertz=config.sample_rate,
                language_code=config.language,
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
                use_enhanced=config.use_enhanced,
            )

            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=config.interim_results,
                single_utterance=config.single_utterance,
            )

            # A new session must not begin with audio left over from an
            # earlier one.
            self._drain_audio_queue()

            self.is_streaming = True
            log_info("✅ Google STT streaming started")

        except Exception as e:
            log_error("❌ Failed to start Google STT streaming", e)
            raise

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Buffer one audio chunk; currently yields no transcription results.

        This is an async generator, so the body (including the "not started"
        check) only runs once the caller begins iterating it.

        Args:
            audio_chunk: Raw audio bytes to buffer in ``audio_queue``.

        Raises:
            RuntimeError: If ``start_streaming()`` has not been called.
        """
        if not self.is_streaming:
            raise RuntimeError("Streaming not started. Call start_streaming() first.")

        try:
            await self.audio_queue.put(audio_chunk)

            # TODO: forward the buffered audio to Google's streaming API and
            # yield TranscriptionResult objects. Until then the iterator is
            # empty.
            return
            yield  # unreachable; its presence makes this an async generator

        except Exception as e:
            log_error("❌ Google STT streaming error", e)
            raise

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """End the streaming session and release buffered audio.

        Returns:
            Always ``None``: no final transcription is produced yet.
        """
        if not self.is_streaming:
            return None

        try:
            self.is_streaming = False
            # Nothing consumes the queue, so release the buffered chunks
            # instead of keeping them alive for the life of the instance.
            self._drain_audio_queue()
            log_info("✅ Google STT streaming stopped")

            return None

        except Exception as e:
            log_error("❌ Failed to stop Google STT streaming", e)
            raise

    def supports_realtime(self) -> bool:
        """Report that this provider supports real-time streaming."""
        return True

    def get_supported_languages(self) -> List[str]:
        """Return the language codes this provider advertises."""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
            "ar-SA",  # Arabic
        ]

    def get_provider_name(self) -> str:
        """Return the provider identifier, ``"google"``."""
        return "google"

    def _drain_audio_queue(self) -> int:
        """Discard every chunk buffered in ``audio_queue``.

        Returns:
            The number of chunks that were discarded.
        """
        dropped = 0
        while True:
            try:
                self.audio_queue.get_nowait()
            except asyncio.QueueEmpty:
                return dropped
            # Keep the queue's unfinished-task counter balanced.
            self.audio_queue.task_done()
            dropped += 1

    def _get_encoding(self, encoding_str: str):
        """Convert an encoding name to the Google Speech ``AudioEncoding`` enum.

        Names are matched case-insensitively and ignoring surrounding
        whitespace. Unknown names fall back to ``WEBM_OPUS`` and a warning is
        logged so a misconfigured encoding is visible.

        Args:
            encoding_str: Encoding name such as ``"LINEAR16"`` or ``"flac"``.

        Returns:
            The matching ``AudioEncoding`` member, or ``None`` when the Google
            Speech library is unavailable.
        """
        if not GOOGLE_SPEECH_AVAILABLE:
            return None

        audio_encoding = speech.RecognitionConfig.AudioEncoding
        encoding_map = {
            "WEBM_OPUS": audio_encoding.WEBM_OPUS,
            "LINEAR16": audio_encoding.LINEAR16,
            "FLAC": audio_encoding.FLAC,
            "MP3": audio_encoding.MP3,
            "OGG_OPUS": audio_encoding.OGG_OPUS,
        }

        # Non-string values (e.g. None) are looked up as-is and take the
        # fallback path, exactly as an unknown name would.
        if isinstance(encoding_str, str):
            key = encoding_str.strip().upper()
        else:
            key = encoding_str

        if key not in encoding_map:
            log_warning(
                f"Unsupported audio encoding {encoding_str!r}, falling back to WEBM_OPUS"
            )
            return audio_encoding.WEBM_OPUS
        return encoding_map[key]