Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,15 +1,50 @@
|
|
1 |
-
|
2 |
-
import
|
|
|
|
|
3 |
import io
|
4 |
-
import os
|
5 |
import tempfile
|
6 |
-
|
7 |
-
|
8 |
|
9 |
# Set up logging
|
10 |
logging.basicConfig(level=logging.DEBUG)
|
11 |
logger = logging.getLogger(__name__)
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
|
15 |
try:
|
@@ -166,75 +201,43 @@ def chat():
|
|
166 |
def handle_voice():
    """Handle an uploaded voice message: transcribe it, get a chatbot reply,
    and return the reply as text plus base64-encoded TTS audio.

    Expects a multipart request with an 'audio' file and an optional
    'conversation_id' form field (a fresh UUID is generated if absent).
    """
    try:
        if 'audio' not in request.files:
            logger.error("No audio file in request")
            return jsonify({'error': 'No audio file provided'}), 400

        audio_file = request.files['audio']
        conversation_id = request.form.get('conversation_id', str(uuid.uuid4()))

        with tempfile.TemporaryDirectory() as temp_dir:
            # Save incoming audio
            input_path = os.path.join(temp_dir, 'input.webm')
            audio_file.save(input_path)
            logger.debug(f"Saved audio file to: {input_path}")

            try:
                # Convert audio using pydub to mono 16 kHz WAV for recognition
                audio = AudioSegment.from_file(input_path)
                output_path = os.path.join(temp_dir, 'output.wav')
                audio.export(output_path, format="wav",
                             parameters=["-ac", "1", "-ar", "16000"])
                logger.debug("Audio conversion successful")

                # Initialize recognition if not already done (cached on the app)
                if not hasattr(app, 'recognizer'):
                    app.recognizer = init_speech_recognition()

                if not app.recognizer:
                    return jsonify({'error': 'Speech recognition unavailable'}), 503

                # Perform speech recognition
                with sr.AudioFile(output_path) as source:
                    audio_data = app.recognizer.record(source)
                    text = app.recognizer.recognize_google(audio_data)
                    logger.debug(f"Speech recognition result: {text}")

                if not text:
                    return jsonify({'error': 'Could not transcribe audio'}), 400

                # Get chatbot response
                response = chat_with_groq(text, conversation_id)

                # Generate voice response
                audio_io = text_to_speech(response)
                result = {
                    'text': text,
                    'response': response,
                    'conversation_id': conversation_id
                }

                if audio_io:
                    audio_base64 = base64.b64encode(audio_io.getvalue()).decode('utf-8')
                    result['voice_response'] = audio_base64

                return jsonify(result)

            except sr.UnknownValueError:
                logger.error("Speech recognition could not understand audio")
                return jsonify({'error': 'Could not understand audio'}), 400
            except sr.RequestError as e:
                logger.error(f"Speech recognition service error: {e}")
                return jsonify({'error': 'Speech recognition service error'}), 503
            except Exception as e:
                logger.error(f"Audio processing error: {e}")
                return jsonify({'error': f'Error processing audio: {str(e)}'}), 400

    except Exception as e:
        # Log the failure and return a proper (body, status) pair.  The
        # original ended with `return jsonify(...),` — a trailing comma that
        # produced a 1-tuple with no HTTP status code.
        logger.error(f"Error in handle_voice: {e}")
        return jsonify({'error': str(e)}), 400
|
239 |
if __name__ == '__main__':
|
240 |
app.run(host='0.0.0.0', port=7860)
|
|
|
1 |
+
import sounddevice as sd
|
2 |
+
import scipy.io.wavfile as wav
|
3 |
+
import numpy as np
|
4 |
+
from pydub import AudioSegment
|
5 |
import io
|
|
|
6 |
import tempfile
|
7 |
+
import os
|
8 |
+
|
9 |
|
10 |
# Set up logging
|
11 |
logging.basicConfig(level=logging.DEBUG)
|
12 |
logger = logging.getLogger(__name__)
|
13 |
|
14 |
+
class AudioProcessor:
    """Convert uploaded audio into mono 16 kHz WAV and record from the mic."""

    def __init__(self):
        # Target format expected by the speech recognizer.
        self.sample_rate = 16000
        self.channels = 1

    def process_audio(self, audio_file):
        """Convert *audio_file* (an uploaded blob, e.g. webm) to mono 16 kHz WAV.

        Returns an ``io.BytesIO`` containing the WAV data, rewound to the
        start.  NOTE: the previous version returned a path located inside a
        ``TemporaryDirectory``; that directory is deleted the moment the
        ``with`` block exits, so the caller always received a dangling path.
        ``sr.AudioFile`` accepts file-like objects as well as filenames, so
        a BytesIO is a drop-in replacement that keeps the data alive.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save incoming audio so pydub/ffmpeg can probe its container.
            input_path = os.path.join(temp_dir, 'input.webm')
            audio_file.save(input_path)

            # Convert to the recognizer's expected format using pydub.
            audio = AudioSegment.from_file(input_path)
            audio = audio.set_channels(self.channels)
            audio = audio.set_frame_rate(self.sample_rate)

            wav_io = io.BytesIO()
            audio.export(wav_io, format='wav')

        wav_io.seek(0)
        return wav_io

    def record_audio(self, duration=5):
        """Record *duration* seconds from the default input device (blocking).

        Returns the raw numpy array produced by sounddevice.
        """
        recording = sd.rec(
            int(duration * self.sample_rate),
            samplerate=self.sample_rate,
            channels=self.channels
        )
        sd.wait()  # block until the recording finishes
        return recording
|
46 |
+
|
47 |
+
|
48 |
|
49 |
|
50 |
try:
|
|
|
201 |
def handle_voice():
    """Handle an uploaded voice message: transcribe it, get a chatbot reply,
    and return the reply as text plus base64-encoded TTS audio.

    Expects a multipart request with an 'audio' file and an optional
    'conversation_id' form field (a fresh UUID is generated if absent).
    """
    try:
        if 'audio' not in request.files:
            logger.error("No audio file in request")
            return jsonify({'error': 'No audio file provided'}), 400

        audio_file = request.files['audio']
        conversation_id = request.form.get('conversation_id', str(uuid.uuid4()))

        # Process audio into the format the recognizer expects
        audio_processor = AudioProcessor()
        wav_path = audio_processor.process_audio(audio_file)

        # Perform speech recognition
        recognizer = sr.Recognizer()
        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)
            text = recognizer.recognize_google(audio_data)

        if not text:
            return jsonify({'error': 'Could not transcribe audio'}), 400

        # Get chatbot response
        response = chat_with_groq(text, conversation_id)

        # Generate voice response
        audio_io = text_to_speech(response)
        result = {
            'text': text,
            'response': response,
            'conversation_id': conversation_id
        }

        if audio_io:
            audio_base64 = base64.b64encode(audio_io.getvalue()).decode('utf-8')
            result['voice_response'] = audio_base64

        return jsonify(result)

    # Specific recognizer failures get precise status codes (these handlers
    # existed in an earlier revision and were lost in the rewrite).
    except sr.UnknownValueError:
        logger.error("Speech recognition could not understand audio")
        return jsonify({'error': 'Could not understand audio'}), 400
    except sr.RequestError as e:
        logger.error(f"Speech recognition service error: {e}")
        return jsonify({'error': 'Speech recognition service error'}), 503
    except Exception as e:
        # Use the module logger (with traceback) instead of print().
        logger.exception(f"Error in handle_voice: {str(e)}")
        return jsonify({'error': str(e)}), 400
|
242 |
# Dev/Spaces entry point: bind to all interfaces on port 7860
# (the Hugging Face Spaces default) when run directly.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
|