import torch
import librosa
import noisereduce as nr
import numpy as np
from transformers import (
    pipeline,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    AutoTokenizer,
)


class AudioSpeechNERPipeline:
    def __init__(
        self,
        stt_model_name='abduaziz/whisper-small-uzbek',
        ner_model_name='abduaziz/roberta-ner-uzbek',
        stt_language='uz',
        chunk_duration=30,
    ):
        # Pipelines are created lazily in _load_pipelines().
        self.stt_pipeline = None
        self.ner_pipeline = None
        self.stt_model_name = stt_model_name
        self.ner_model_name = ner_model_name
        self.stt_language = stt_language
        self.chunk_duration = chunk_duration

    def load_whisper_model(self, model_name='abduaziz/whisper-small-uzbek'):
        """Load the fine-tuned Whisper checkpoint with the base processor."""
        try:
            # The fine-tuned checkpoint reuses the openai/whisper-small
            # processor, configured for Uzbek transcription.
            processor = WhisperProcessor.from_pretrained(
                "openai/whisper-small", language="Uzbek", task="transcribe"
            )
            model = WhisperForConditionalGeneration.from_pretrained(model_name)
            return model, processor
        except Exception as e:
            print(f"Error loading Whisper model: {e}")
            raise

    def _load_pipelines(self):
        """Lazily load the STT and NER pipelines only when needed."""
        if self.stt_pipeline is None:
            model, processor = self.load_whisper_model(self.stt_model_name)
            tokenizer = AutoTokenizer.from_pretrained(self.stt_model_name)
            self.stt_pipeline = pipeline(
                "automatic-speech-recognition",
                model=model,
                feature_extractor=processor.feature_extractor,
                tokenizer=tokenizer,
                return_timestamps=True,
            )
        if self.ner_pipeline is None:
            self.ner_pipeline = pipeline(
                task="ner",
                model=self.ner_model_name,
            )

    def chunk_audio(self, audio, sample_rate):
        """Split audio into fixed-duration chunks for the STT pipeline."""
        chunk_samples = int(self.chunk_duration * sample_rate)
        return [
            {'array': audio[start:start + chunk_samples], 'sampling_rate': sample_rate}
            for start in range(0, len(audio), chunk_samples)
        ]

    def transcribe_audio(self, audio_path):
        """Transcribe an audio file, chunking recordings longer than chunk_duration."""
        self._load_pipelines()

        # Whisper models expect 16 kHz mono input.
        audio, sample_rate = librosa.load(audio_path, sr=16000)
        preprocessed_audio = preprocess_audio(audio, sr=sample_rate)

        if preprocessed_audio is None:
            raise ValueError("Audio preprocessing failed")

        # Long clips are transcribed chunk by chunk and re-joined.
        if len(preprocessed_audio) / sample_rate > self.chunk_duration:
            chunks = self.chunk_audio(preprocessed_audio, sample_rate)
            transcriptions = [
                self.stt_pipeline(chunk)['text'] for chunk in chunks
            ]
            return " ".join(transcriptions)

        return self.stt_pipeline({
            'array': preprocessed_audio,
            'sampling_rate': sample_rate,
        })['text']

    def process_audio(self, audio_path):
        """Run speech-to-text followed by NER on a single audio file."""
        transcription = self.transcribe_audio(audio_path)

        # transcribe_audio has already loaded both pipelines.
        entities = self.ner_pipeline(transcription)

        return transcription, entities

def preprocess_audio(audio_array, sr=16000):
    """Denoise, normalize, and trim audio; returns float32 audio or None on failure."""
    try:
        # Accept torch tensors as well as numpy arrays (move off GPU if needed).
        if isinstance(audio_array, torch.Tensor):
            audio_array = audio_array.detach().cpu().numpy()

        # Downmix multi-channel audio to mono.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=0)

        # Spectral-gating noise reduction with a conservative 50% decrease.
        noise_reduced = nr.reduce_noise(
            y=audio_array,
            sr=sr,
            prop_decrease=0.5,
            n_std_thresh_stationary=1.5,
        )

        # Peak-normalize, then trim leading/trailing silence below 25 dB.
        normalized_audio = librosa.util.normalize(noise_reduced)
        trimmed_audio, _ = librosa.effects.trim(normalized_audio, top_db=25)

        return trimmed_audio.astype(np.float32)

    except Exception as e:
        print(f"Audio preprocessing error: {e}")
        return None
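
# Minimal usage sketch. Assumptions: the two Hugging Face checkpoints above are
# reachable, and "example.wav" is a hypothetical path to a local Uzbek recording.
if __name__ == "__main__":
    asr_ner = AudioSpeechNERPipeline()
    transcription, entities = asr_ner.process_audio("example.wav")
    print("Transcription:", transcription)
    # Each entity is a dict from the token-classification pipeline,
    # e.g. {"entity": ..., "word": ..., "score": ..., "start": ..., "end": ...}.
    for entity in entities:
        print(entity)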