Afrinetwork7
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,6 @@ import base64
|
|
8 |
import logging
|
9 |
import torch
|
10 |
import librosa
|
11 |
-
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
12 |
from pathlib import Path
|
13 |
import magic # For MIME type detection
|
14 |
from pydub import AudioSegment
|
@@ -17,7 +16,7 @@ from pydub import AudioSegment
|
|
17 |
from asr import transcribe, ASR_LANGUAGES
|
18 |
from tts import synthesize, TTS_LANGUAGES
|
19 |
from lid import identify
|
20 |
-
from asr import ASR_SAMPLING_RATE
|
21 |
|
22 |
# Configure logging
|
23 |
logging.basicConfig(level=logging.INFO)
|
@@ -78,11 +77,27 @@ async def transcribe_audio(request: AudioRequest):
|
|
78 |
@app.post("/synthesize")
|
79 |
async def synthesize_speech(request: TTSRequest):
|
80 |
try:
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
# Convert numpy array to bytes
|
83 |
buffer = io.BytesIO()
|
84 |
-
sf.write(buffer, audio,
|
85 |
buffer.seek(0)
|
|
|
86 |
return FileResponse(
|
87 |
buffer,
|
88 |
media_type="audio/wav",
|
@@ -117,4 +132,4 @@ async def get_tts_languages():
|
|
117 |
return JSONResponse(content=TTS_LANGUAGES)
|
118 |
except Exception as e:
|
119 |
logger.error(f"Error in get_tts_languages: {str(e)}")
|
120 |
-
raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
|
|
|
8 |
import logging
|
9 |
import torch
|
10 |
import librosa
|
|
|
11 |
from pathlib import Path
|
12 |
import magic # For MIME type detection
|
13 |
from pydub import AudioSegment
|
|
|
16 |
from asr import transcribe, ASR_LANGUAGES
|
17 |
from tts import synthesize, TTS_LANGUAGES
|
18 |
from lid import identify
|
19 |
+
from asr import ASR_SAMPLING_RATE
|
20 |
|
21 |
# Configure logging
|
22 |
logging.basicConfig(level=logging.INFO)
|
|
|
77 |
@app.post("/synthesize")
|
78 |
async def synthesize_speech(request: TTSRequest):
|
79 |
try:
|
80 |
+
logger.info(f"Synthesizing speech for text: {request.text}, language: {request.language}, speed: {request.speed}")
|
81 |
+
result, filtered_text = synthesize(request.text, request.language, request.speed)
|
82 |
+
logger.info(f"Synthesis complete. Filtered text: {filtered_text}")
|
83 |
+
|
84 |
+
sample_rate, audio = result
|
85 |
+
logger.info(f"Sample rate: {sample_rate}, Audio shape: {audio.shape}, Audio dtype: {audio.dtype}")
|
86 |
+
|
87 |
+
# Ensure audio is a numpy array with the correct dtype
|
88 |
+
audio = np.array(audio, dtype=np.float32)
|
89 |
+
|
90 |
+
# Normalize audio to [-1, 1] range
|
91 |
+
audio = audio / np.max(np.abs(audio))
|
92 |
+
|
93 |
+
# Convert to int16 for WAV file
|
94 |
+
audio = (audio * 32767).astype(np.int16)
|
95 |
+
|
96 |
# Convert numpy array to bytes
|
97 |
buffer = io.BytesIO()
|
98 |
+
sf.write(buffer, audio, sample_rate, format='wav')
|
99 |
buffer.seek(0)
|
100 |
+
|
101 |
return FileResponse(
|
102 |
buffer,
|
103 |
media_type="audio/wav",
|
|
|
132 |
return JSONResponse(content=TTS_LANGUAGES)
|
133 |
except Exception as e:
|
134 |
logger.error(f"Error in get_tts_languages: {str(e)}")
|
135 |
+
raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
|