File size: 2,643 Bytes
d89ceaa
 
 
 
 
252fde6
 
d89ceaa
252fde6
d89ceaa
252fde6
d89ceaa
 
 
 
 
252fde6
 
 
 
 
 
 
 
 
 
d89ceaa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252fde6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc4a1e0
252fde6
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import tempfile
import logging
from typing import Optional

import torch
import librosa
import edge_tts
from transformers import WhisperProcessor, WhisperForConditionalGeneration

from config.config import VOICE, FALLBACK_VOICES


logger = logging.getLogger(__name__)

# Whisper model for speech to text.
# The processor and the model are both created once, at import time, from the
# same checkpoint id so that they stay in sync.
_WHISPER_CHECKPOINT = "openai/whisper-tiny"

processor = WhisperProcessor.from_pretrained(
    _WHISPER_CHECKPOINT,
    local_files_only=False,
)

model = WhisperForConditionalGeneration.from_pretrained(
    _WHISPER_CHECKPOINT,
    local_files_only=False,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float32,
)
# Inference is pinned to the CPU device.
model = model.to("cpu")

# Voice selection handling
async def get_valid_voice() -> str:
    """Return the first configured voice that Edge TTS reports as available.

    Candidates are checked in order: ``VOICE`` first, then each entry of
    ``FALLBACK_VOICES``. A candidate matches when it equals the
    ``"ShortName"`` of one of the entries returned by
    ``edge_tts.list_voices()``.

    Returns:
        The name of the first matching candidate.

    Raises:
        RuntimeError: If none of the configured candidates is available.
    """
    catalogue = await edge_tts.list_voices()
    candidates = [VOICE] + FALLBACK_VOICES

    # Set membership keeps the per-candidate lookup cheap.
    offered = {entry["ShortName"] for entry in catalogue}

    chosen = next((name for name in candidates if name in offered), None)
    if chosen is None:
        raise RuntimeError("No valid voice found")
    return chosen

# Text-to-speech conversion using Edge TTS
async def generate_speech(text: str) -> Optional[str]:
    """Synthesize ``text`` to an MP3 temp file with Edge TTS and return its path.

    The voice is chosen by ``get_valid_voice()``. The output file is created
    with ``delete=False``, so on success it is NOT removed by this function;
    the path is handed back to the caller. On any failure after the file has
    been created, the file is removed again so that failed calls do not
    accumulate orphaned temp files.

    Args:
        text: Non-empty string to speak.

    Returns:
        Path of the generated ``.mp3`` file.

    Raises:
        ValueError: If ``text`` is empty or not a string.
        RuntimeError: If the output file is missing or empty after synthesis
            (or, via ``get_valid_voice()``, if no configured voice is valid).
    """
    if not text or not isinstance(text, str):
        raise ValueError("Invalid text input")

    voice = await get_valid_voice()
    logger.info("Using voice: %s", voice)

    # Only the path is needed here: the handle is closed immediately and
    # the synthesized audio is written to that path afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(tmp_path)

        if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
            raise RuntimeError("Speech file empty or not created")
    except BaseException:
        # BaseException (not Exception) so that task cancellation during the
        # await also cleans up. The original error is always re-raised.
        try:
            os.remove(tmp_path)
        except OSError:
            # Best-effort cleanup: the file may already be gone.
            pass
        raise

    logger.info("Speech generated successfully: %s", tmp_path)
    return tmp_path

# Speech-to-text using Whisper
async def transcribe(audio_file: str) -> str:
    """Transcribe an audio file to English text with the module's Whisper model.

    The audio is loaded through ``librosa.load`` with ``sr=16000``,
    ``mono=True`` and ``duration=30``, converted to model inputs by the
    module-level ``processor``, and decoded by the module-level ``model``.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The decoded transcription with special tokens skipped and
        surrounding whitespace stripped.
    """
    waveform, sample_rate = librosa.load(audio_file, sr=16000, mono=True, duration=30)

    features = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        return_attention_mask=True,
    )
    # Put the inputs on whichever device the model lives on.
    features = features.to(model.device)

    generation_options = {
        "language": "en",
        "task": "transcribe",
        "max_length": 448,
        "temperature": 0.0,
    }

    # Inference only: no gradients are needed while generating.
    with torch.no_grad():
        token_ids = model.generate(
            input_features=features.input_features,
            attention_mask=features.attention_mask,
            **generation_options,
        )

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    transcription = decoded[0].strip()

    logger.info(f"Transcribed text: {transcription}")
    return transcription