File size: 2,766 Bytes
b8a4e79
13db51f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8a4e79
13db51f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8a4e79
 
13db51f
 
b8a4e79
13db51f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8a4e79
13db51f
 
b8a4e79
13db51f
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import whisper as whp
import numpy as np
import logging
import io
import librosa

# Module-level logger named after this module; LanguageDetector logs through it.
logger = logging.getLogger(__name__)

class LanguageDetector:
    """Detect the spoken language of an audio clip with a Whisper model.

    The model is loaded once in the constructor and reused for every
    detection call. Both public methods return ``(language_code, confidence)``.
    """

    # 16 kHz, matching the default sample rate of ``whisper.load_audio`` so
    # that the bytes path feeds Whisper the same kind of waveform as the
    # file path.
    _SAMPLE_RATE = 16000

    def __init__(self, model_name: str = "tiny"):
        """
        Initialize the language detector with a Whisper model.

        Args:
            model_name (str): Name of the Whisper model to use. Default is
                "tiny" which is sufficient for language detection.
        """
        self.model = whp.load_model(model_name)
        logger.info("Loaded Whisper model %s for language detection", model_name)

    def _detect_from_audio(self, audio):
        """
        Run Whisper language detection on an already-decoded waveform.

        Args:
            audio: 1-D floating-point waveform sampled at 16 kHz (the format
                returned by ``whisper.load_audio``).

        Returns:
            str: Detected language code (e.g., "en", "fr", etc.)
            float: Confidence score (probability of the detected language)
        """
        # Whisper works on fixed-length windows; pad or cut to that length.
        audio = whp.pad_or_trim(audio)

        # Use the mel-bin count the loaded checkpoint was trained with rather
        # than the library default, so models with a different n_mels
        # (e.g. large-v3) work too.
        mel = whp.log_mel_spectrogram(
            audio, n_mels=self.model.dims.n_mels
        ).to(self.model.device)

        # For a single spectrogram, probs maps language code -> probability.
        _, probs = self.model.detect_language(mel)
        detected_lang = max(probs, key=probs.get)
        return detected_lang, probs[detected_lang]

    def detect_language_from_file(self, audio_file_path: str):
        """
        Detect language from an audio file.

        Args:
            audio_file_path (str): Path to the audio file

        Returns:
            str: Detected language code (e.g., "en", "fr", etc.)
            float: Confidence score

        Raises:
            Exception: Any error raised while loading or analysing the audio
                is logged and re-raised unchanged.
        """
        try:
            audio = whp.load_audio(audio_file_path)
            return self._detect_from_audio(audio)
        except Exception as e:
            logger.error("Error in language detection: %s", e)
            raise

    def detect_language_from_bytes(self, audio_bytes: bytes):
        """
        Detect language from audio bytes.

        Args:
            audio_bytes (bytes): Encoded audio data (any container/codec that
                ``librosa.load`` can decode from a file-like object)

        Returns:
            str: Detected language code (e.g., "en", "fr", etc.)
            float: Confidence score

        Raises:
            Exception: Any error raised while decoding or analysing the audio
                is logged and re-raised unchanged.
        """
        try:
            # Decode and resample in one step.
            audio, _ = librosa.load(io.BytesIO(audio_bytes), sr=self._SAMPLE_RATE)

            # Keep the waveform as float32. Whisper's mel front end expects
            # floating-point samples (as produced by whisper.load_audio);
            # scaling to int16 here would yield an integer tensor that the
            # STFT cannot process and would overflow at full-scale samples.
            audio = np.asarray(audio, dtype=np.float32)

            return self._detect_from_audio(audio)
        except Exception as e:
            logger.error("Error in language detection: %s", e)
            raise