# stt_ner/pipe.py
import torch
import librosa
import noisereduce as nr
import numpy as np
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer
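
# Wires a Whisper speech-to-text pipeline to a RoBERTa token-classification
# pipeline for Uzbek audio: load -> denoise/normalize/trim -> chunk long
# recordings -> transcribe -> run NER over the transcript.
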
class AudioSpeechNERPipeline:
def __init__(
self,
stt_model_name='abduaziz/whisper-small-uzbek',
ner_model_name='abduaziz/roberta-ner-uzbek',
stt_language='uz',
chunk_duration=30
):
# Use lazy loading for pipelines
self.stt_pipeline = None
self.ner_pipeline = None
self.stt_model_name = stt_model_name
self.ner_model_name = ner_model_name
self.chunk_duration = chunk_duration
def load_whisper_model(self, model_name='abduaziz/whisper-small-uzbek'):
try:
            # The processor (feature extractor + tokenizer) comes from the base
            # Whisper checkpoint, configured for Uzbek transcription
            processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Uzbek", task="transcribe")
# Load model
model = WhisperForConditionalGeneration.from_pretrained(model_name)
return model, processor
except Exception as e:
print(f"Error loading Whisper model: {e}")
raise
def _load_pipelines(self):
"""Lazy load pipelines only when needed"""
        if self.stt_pipeline is None:
            # Load the Whisper model and processor explicitly
            model, processor = self.load_whisper_model(self.stt_model_name)
            tokenizer = AutoTokenizer.from_pretrained(self.stt_model_name)
            self.stt_pipeline = pipeline(
                "automatic-speech-recognition",
                model=model,
                feature_extractor=processor.feature_extractor,
                tokenizer=tokenizer,
                return_timestamps=True
            )
if self.ner_pipeline is None:
self.ner_pipeline = pipeline(
task="ner",
model=self.ner_model_name
)
def chunk_audio(self, audio, sample_rate):
"""More efficient audio chunking"""
chunk_samples = self.chunk_duration * sample_rate
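        # The final chunk may be shorter than chunk_duration; the Whisper
        # feature extractor pads shorter inputs internally, so no padding is
        # needed here.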
return [
{'array': audio[start:start+chunk_samples], 'sampling_rate': sample_rate}
for start in range(0, len(audio), chunk_samples)
]
def transcribe_audio(self, audio_path):
"""Enhanced audio transcription with better error handling"""
self._load_pipelines()
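        # Whisper models expect 16 kHz mono audio, so resample on load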
audio, sample_rate = librosa.load(audio_path, sr=16000)
preprocessed_audio = preprocess_audio(audio, sr=sample_rate)
if preprocessed_audio is None:
raise ValueError("Audio preprocessing failed")
if len(preprocessed_audio) / sample_rate > self.chunk_duration:
chunks = self.chunk_audio(preprocessed_audio, sample_rate)
transcriptions = [
self.stt_pipeline(chunk)['text'] for chunk in chunks
]
return " ".join(transcriptions)
return self.stt_pipeline({
'array': preprocessed_audio,
'sampling_rate': sample_rate
})['text']
    def process_audio(self, audio_path):
        """Transcribe the audio, then run NER over the transcript."""
        # transcribe_audio already lazy-loads both pipelines
        transcription = self.transcribe_audio(audio_path)
        entities = self.ner_pipeline(transcription)
        return transcription, entities
def preprocess_audio(audio_array, sr=16000):
"""Improved audio preprocessing with better type handling"""
try:
# Handle tensor or numpy array input
if isinstance(audio_array, torch.Tensor):
audio_array = audio_array.numpy()
# Convert stereo to mono
if audio_array.ndim > 1:
audio_array = audio_array.mean(axis=0)
# Noise reduction and normalization
noise_reduced = nr.reduce_noise(
y=audio_array,
sr=sr,
prop_decrease=0.5,
n_std_thresh_stationary=1.5
)
normalized_audio = librosa.util.normalize(noise_reduced)
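        # Trim leading/trailing silence more than 25 dB below the peak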
trimmed_audio, _ = librosa.effects.trim(normalized_audio, top_db=25)
return trimmed_audio.astype(np.float32)
except Exception as e:
print(f"Audio preprocessing error: {e}")
return None
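

# Usage sketch: a minimal example of driving the pipeline end to end.
# "sample.wav" is a placeholder path; the printed entity fields follow the
# standard transformers token-classification output schema.
if __name__ == "__main__":
    audio_ner = AudioSpeechNERPipeline()
    text, entities = audio_ner.process_audio("sample.wav")
    print("Transcription:", text)
    for entity in entities:
        # Each entity is a dict like:
        # {'entity': 'B-PER', 'score': 0.99, 'word': '...', 'start': 0, 'end': 4}
        print(f"{entity['word']}: {entity['entity']} ({entity['score']:.2f})")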