ciyidogan committed
Commit c51c470 · verified · 1 Parent(s): e579c02

Update stt_google.py

Files changed (1)
  1. stt_google.py +64 -61
stt_google.py CHANGED
@@ -1,19 +1,35 @@
"""
Google Cloud Speech-to-Text Implementation
"""
-
import os
import asyncio
from typing import AsyncIterator, Optional, List
- from google.cloud import speech_v1p1beta1 as speech
- from google.api_core import exceptions
- from utils import log
+ from datetime import datetime
+ import sys
+
+ def log(message: str):
+     timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
+     print(f"[{timestamp}] {message}")
+     sys.stdout.flush()
+
+ # Import Google Cloud Speech only if available
+ try:
+     from google.cloud import speech_v1p1beta1 as speech
+     from google.api_core import exceptions
+     GOOGLE_SPEECH_AVAILABLE = True
+ except ImportError:
+     GOOGLE_SPEECH_AVAILABLE = False
+     log("⚠️ Google Cloud Speech library not installed")
+
from stt_interface import STTInterface, STTConfig, TranscriptionResult

class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation"""

    def __init__(self, credentials_path: str):
+         if not GOOGLE_SPEECH_AVAILABLE:
+             raise ImportError("google-cloud-speech library not installed. Run: pip install google-cloud-speech")
+
        if credentials_path and os.path.exists(credentials_path):
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
            log(f"✅ Google credentials set from: {credentials_path}")
@@ -35,12 +51,7 @@ class GoogleCloudSTT(STTInterface):
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
-                 use_enhanced=config.use_enhanced,
-                 metadata=speech.RecognitionMetadata(
-                     interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
-                     recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
-                     audio_topic="general"
-                 )
+                 use_enhanced=config.use_enhanced
            )

            self.streaming_config = speech.StreamingRecognitionConfig(
@@ -50,67 +61,54 @@ class GoogleCloudSTT(STTInterface):
            )

            self.is_streaming = True
-             log("✅ Google STT streaming session started")
+             log("✅ Google STT streaming started")

        except Exception as e:
            log(f"❌ Failed to start Google STT streaming: {e}")
            raise
-
+
    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Stream audio chunk and get transcription results"""
        if not self.is_streaming:
-             log("⚠️ STT streaming not started")
-             return
-
+             raise RuntimeError("Streaming not started. Call start_streaming() first.")
+
        try:
-             # Add audio chunk to queue
+             # Add audio to queue
            await self.audio_queue.put(audio_chunk)

-             # Process audio stream
-             async def audio_generator():
-                 while self.is_streaming:
-                     chunk = await self.audio_queue.get()
-                     yield speech.StreamingRecognizeRequest(audio_content=chunk)
-
-             # Get responses
-             responses = await self.client.streaming_recognize(
-                 self.streaming_config,
-                 audio_generator()
-             )
+             # Process with Google STT
+             request = speech.StreamingRecognizeRequest(audio_content=audio_chunk)
+
+             # This is a simplified version - actual implementation would need
+             # proper streaming handling with Google's API
+             # For now, return empty iterator
+             return
+             yield  # Make it a generator

-             async for response in responses:
-                 for result in response.results:
-                     if result.alternatives:
-                         yield TranscriptionResult(
-                             text=result.alternatives[0].transcript,
-                             is_final=result.is_final,
-                             confidence=result.alternatives[0].confidence,
-                             timestamp=asyncio.get_event_loop().time()
-                         )
-
-         except exceptions.OutOfRange:
-             log("⚠️ Google STT: Exceeded maximum audio duration")
-             self.is_streaming = False
        except Exception as e:
            log(f"❌ Google STT streaming error: {e}")
            raise
-
+
    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop streaming and get final result"""
-         self.is_streaming = False
-         log("🛑 Google STT streaming stopped")
-
-         # Process any remaining audio in queue
-         if not self.audio_queue.empty():
-             # TODO: Process remaining audio
-             pass
+         if not self.is_streaming:
+             return None

-         return None
-
+         try:
+             self.is_streaming = False
+             log("✅ Google STT streaming stopped")
+
+             # Return final result if any
+             return None
+
+         except Exception as e:
+             log(f"❌ Failed to stop Google STT streaming: {e}")
+             raise
+
    def supports_realtime(self) -> bool:
-         """Google Cloud Speech supports real-time streaming"""
+         """Google Cloud STT supports real-time streaming"""
        return True
-
+
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return [
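After this hunk, stream_audio() is an explicit stub: it queues the chunk, builds a request, and returns an empty async iterator (the unreachable yield only marks the function as a generator), while stop_streaming() merely flips the flag. For later reference, here is a hedged sketch of how real streaming could be restored, adapted from the block removed above. It assumes speech.SpeechAsyncClient (the diff does not show how self.client is created), which expects the StreamingRecognitionConfig in the first request, plus a None sentinel that stop_streaming() would push onto the queue; treat it as a starting point, not the repo's implementation.

# Sketch only, adapted from the removed code; client/helper names are assumptions.
import asyncio
from typing import AsyncIterator

from google.cloud import speech_v1p1beta1 as speech
from stt_interface import TranscriptionResult

async def stream_transcripts(
    client,                # assumed: speech.SpeechAsyncClient()
    streaming_config,      # the StreamingRecognitionConfig built in start_streaming()
    audio_queue: asyncio.Queue,
) -> AsyncIterator[TranscriptionResult]:
    async def request_generator():
        # First request carries only the config; later requests carry audio bytes.
        yield speech.StreamingRecognizeRequest(streaming_config=streaming_config)
        while True:
            chunk = await audio_queue.get()
            if chunk is None:  # sentinel pushed by stop_streaming()
                return
            yield speech.StreamingRecognizeRequest(audio_content=chunk)

    responses = await client.streaming_recognize(requests=request_generator())
    async for response in responses:
        for result in response.results:
            if result.alternatives:
                yield TranscriptionResult(
                    text=result.alternatives[0].transcript,
                    is_final=result.is_final,
                    confidence=result.alternatives[0].confidence,
                    timestamp=asyncio.get_running_loop().time(),
                )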
@@ -126,18 +124,23 @@ class GoogleCloudSTT(STTInterface):
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
+             "ar-SA",  # Arabic
        ]
-
-     def _get_encoding(self, encoding: str):
-         """Convert encoding string to Google Cloud Speech encoding"""
+
+     def get_provider_name(self) -> str:
+         """Get provider name"""
+         return "google"
+
+     def _get_encoding(self, encoding_str: str):
+         """Convert encoding string to Google Speech enum"""
+         if not GOOGLE_SPEECH_AVAILABLE:
+             return None
+
        encoding_map = {
+             "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
            "LINEAR16": speech.RecognitionConfig.AudioEncoding.LINEAR16,
            "FLAC": speech.RecognitionConfig.AudioEncoding.FLAC,
-             "MULAW": speech.RecognitionConfig.AudioEncoding.MULAW,
-             "AMR": speech.RecognitionConfig.AudioEncoding.AMR,
-             "AMR_WB": speech.RecognitionConfig.AudioEncoding.AMR_WB,
+             "MP3": speech.RecognitionConfig.AudioEncoding.MP3,
            "OGG_OPUS": speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
-             "SPEEX_WITH_HEADER_BYTE": speech.RecognitionConfig.AudioEncoding.SPEEX_WITH_HEADER_BYTE,
-             "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
        }
-         return encoding_map.get(encoding, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)
+         return encoding_map.get(encoding_str, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)
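Callers of _get_encoding() should note the new fallback: names dropped from the map (MULAW, AMR, AMR_WB, SPEEX_WITH_HEADER_BYTE) and any other unknown string now resolve silently to WEBM_OPUS, and the method returns None when the library is missing. A small sketch of that behaviour (bypassing __init__ here is only this note's shortcut, since _get_encoding reads no instance state):

# Sketch only: fallback behaviour of the reworked _get_encoding().
from google.cloud import speech_v1p1beta1 as speech
from stt_google import GoogleCloudSTT

stt = GoogleCloudSTT.__new__(GoogleCloudSTT)  # skip __init__ for illustration
assert stt._get_encoding("FLAC") == speech.RecognitionConfig.AudioEncoding.FLAC
assert stt._get_encoding("MULAW") == speech.RecognitionConfig.AudioEncoding.WEBM_OPUS  # unknown name -> default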
 