# src/services/speechtotext.py
# import threading
# import pyaudio
# import wave
# import io
# import time
# from groq import Groq
# from pydub import AudioSegment
#
# class SpeechTotext:
#     def __init__(self):
#         self.client = Groq()
#         self.is_recording = False
#         self.frames = []
#         self.chunk = 1024
#         self.format = pyaudio.paInt16
#         self.channels = 1
#         self.rate = 44100
#         self.p = pyaudio.PyAudio()
#         # Silence detection parameters
#         self.silence_threshold = -35.0  # dBFS; adjusted to be more lenient
#         self.silence_duration = 3.0  # seconds
#         self.buffer_duration = 0.1  # seconds for each audio chunk analysis
#         self.silent_chunks = 0
#         self.chunks_per_second = int(1 / self.buffer_duration)
#
#     def detect_silence(self, audio_segment):
#         """Check if the audio chunk is silent using pydub."""
#         return audio_segment.dBFS < self.silence_threshold
#
#     def record_audio(self):
#         stream = self.p.open(
#             format=self.format,
#             channels=self.channels,
#             rate=self.rate,
#             input=True,
#             frames_per_buffer=self.chunk
#         )
#         self.frames = []
#         buffer_samples = int(self.buffer_duration * self.rate)
#         while self.is_recording:
#             # Read enough chunks to fill the buffer duration
#             buffer_data = b''
#             chunks_needed = max(1, int(buffer_samples / self.chunk))
#             for _ in range(chunks_needed):
#                 data = stream.read(self.chunk)
#                 buffer_data += data
#                 self.frames.append(data)
#             # Convert the buffer to a pydub AudioSegment
#             audio_buffer = AudioSegment(
#                 data=buffer_data,
#                 sample_width=self.p.get_sample_size(self.format),
#                 frame_rate=self.rate,
#                 channels=self.channels
#             )
#             # Check for silence
#             if self.detect_silence(audio_buffer):
#                 self.silent_chunks += 1
#                 if self.silent_chunks >= self.silence_duration * self.chunks_per_second:
#                     print(f"Silence detected for {self.silence_duration} seconds, stopping recording...")
#                     self.is_recording = False
#                     break
#             else:
#                 self.silent_chunks = 0  # Reset silent chunk counter when sound is detected
#         stream.stop_stream()
#         stream.close()
#
#     def start_recording(self):
#         """Start recording audio on a background thread."""
#         self.is_recording = True
#         self.silent_chunks = 0
#         threading.Thread(target=self.record_audio).start()
#
#     def stop_recording(self):
#         """Stop recording audio and transcribe."""
#         self.is_recording = False
#         print("Recording stopped")
#         # Save the recorded audio to a BytesIO object
#         wav_buffer = io.BytesIO()
#         with wave.open(wav_buffer, 'wb') as wf:
#             wf.setnchannels(self.channels)
#             wf.setsampwidth(self.p.get_sample_size(self.format))
#             wf.setframerate(self.rate)
#             wf.writeframes(b''.join(self.frames))
#         # Rewind the buffer and transcribe
#         wav_buffer.seek(0)
#         try:
#             transcription = self.client.audio.transcriptions.create(
#                 file=("audio.wav", wav_buffer),
#                 model="whisper-large-v3-turbo"
#             )
#             print(f"Transcript: {transcription.text}")
#         except Exception as e:
#             print(f"Error while transcribing audio: {str(e)}")
#         finally:
#             wav_buffer.close()
#
#     def cleanup(self):
#         """Clean up PyAudio."""
#         self.p.terminate()
#
# if __name__ == "__main__":
#     recorder = SpeechTotext()
#     try:
#         print("Starting recording... (will stop after 3 seconds of silence)")
#         recorder.start_recording()
#         # Wait for recording to finish
#         while recorder.is_recording:
#             time.sleep(0.1)
#         recorder.stop_recording()
#     finally:
#         recorder.cleanup()
# The version above uses pydub-based silence detection; kept in case it is needed in future versions.

import pyaudio
import wave
import io
from array import array
from groq import Groq
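
# Note: this module assumes the `pyaudio` and `groq` packages are installed
# (e.g. `pip install pyaudio groq`).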

class SpeechToText:
    def __init__(self):
        self.client = Groq()  # reads GROQ_API_KEY from the environment
        self.chunk = 4096  # frames per stream read
        self.format = pyaudio.paInt16  # 16-bit signed samples
        self.channels = 1
        self.rate = 16000  # Hz
        self.silence_threshold = 1000  # peak amplitude below this counts as silence
        self.silence_duration = 3.0  # seconds of silence that stop the recording
        self.seconds_per_chunk = self.chunk / self.rate  # duration of one chunk
        self.chunks_for_silence = int(self.silence_duration / self.seconds_per_chunk)
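
    # Worked example with the defaults above: each chunk covers
    # 4096 / 16000 = 0.256 s of audio, so int(3.0 / 0.256) = 11 consecutive
    # quiet chunks (about 2.8 s) trigger the stop condition.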
    def record_and_transcribe(self):
        """Records audio until `silence_duration` seconds of silence and returns the transcription."""
        # Create a new PyAudio instance for each call; when a single instance
        # was created in the constructor, recording could not run more than once.
        p = pyaudio.PyAudio()
        stream = p.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            input=True,
            frames_per_buffer=self.chunk
        )
        frames = []
        silent_chunk_counter = 0
        print("Recording started... (will stop after 3 seconds of silence)")
        while True:
            try:
                data = stream.read(self.chunk, exception_on_overflow=False)
                frames.append(data)
                # Detect silence: interpret the buffer as 16-bit signed samples
                # and compare the peak amplitude against the threshold
                audio_data = array('h', data)
                if max(abs(x) for x in audio_data) < self.silence_threshold:
                    silent_chunk_counter += 1
                    if silent_chunk_counter >= self.chunks_for_silence:
                        print(f"Detected {self.silence_duration} seconds of silence, stopping...")
                        break
                else:
                    silent_chunk_counter = 0  # reset on any non-silent chunk
            except IOError as e:
                print(f"Error recording: {e}")
                break
        stream.stop_stream()
        stream.close()
        sample_width = p.get_sample_size(self.format)  # 2 bytes for paInt16
        p.terminate()  # ensure PyAudio is completely closed before transcribing
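        # Package the raw frames as an in-memory WAV file: the transcription
        # call below takes a (filename, file-like) tuple, so io.BytesIO avoids
        # writing a temporary file to disk.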
        wav_buffer = io.BytesIO()
        try:
            with wave.open(wav_buffer, 'wb') as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(sample_width)
                wf.setframerate(self.rate)
                wf.writeframes(b''.join(frames))
            wav_buffer.seek(0)
            transcription = self.client.audio.transcriptions.create(
                file=("audio.wav", wav_buffer),
                model="whisper-large-v3-turbo"
            )
            return transcription.text
        except Exception as e:
            print(f"Error transcribing: {e}")
            return None  # signal failure instead of returning the error text
        finally:
            wav_buffer.close()
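
    # A possible alternative silence test (sketch only, left commented out): an
    # RMS check is less sensitive to a single loud click than the peak-amplitude
    # check above. The threshold is an assumed starting point that would need
    # tuning, and the stdlib audioop module used here was removed in Python 3.13.
    #
    # import audioop
    # def _is_silent_rms(self, data: bytes, threshold: int = 500) -> bool:
    #     return audioop.rms(data, 2) < threshold  # 2 = bytes per paInt16 sample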

if __name__ == "__main__":
    recorder = SpeechToText()
    transcribed_text = recorder.record_and_transcribe()
    if transcribed_text:
        print(f"Transcription: {transcribed_text}")