import torch
import librosa
import noisereduce as nr
import numpy as np
from transformers import (
    pipeline,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    AutoTokenizer,
)


class AudioSpeechNERPipeline:
    def __init__(
        self,
        stt_model_name='abduaziz/whisper-small-uzbek',
        ner_model_name='abduaziz/roberta-ner-uzbek',
        stt_language='uz',
        chunk_duration=30,
    ):
        # Pipelines are created lazily in _load_pipelines().
        self.stt_pipeline = None
        self.ner_pipeline = None
        self.stt_model_name = stt_model_name
        self.ner_model_name = ner_model_name
        self.stt_language = stt_language
        self.chunk_duration = chunk_duration

    def load_whisper_model(self, model_name='abduaziz/whisper-small-uzbek'):
        """Load the fine-tuned Whisper checkpoint with the base processor."""
        try:
            # The fine-tuned checkpoint reuses the openai/whisper-small
            # processor, configured for Uzbek transcription.
            processor = WhisperProcessor.from_pretrained(
                "openai/whisper-small", language="Uzbek", task="transcribe"
            )
            model = WhisperForConditionalGeneration.from_pretrained(model_name)
            return model, processor
        except Exception as e:
            print(f"Error loading Whisper model: {e}")
            raise

    def _load_pipelines(self):
        """Lazily load the STT and NER pipelines only when needed."""
        if self.stt_pipeline is None:
            model, processor = self.load_whisper_model(self.stt_model_name)
            tokenizer = AutoTokenizer.from_pretrained(self.stt_model_name)
            self.stt_pipeline = pipeline(
                "automatic-speech-recognition",
                model=model,
                feature_extractor=processor.feature_extractor,
                tokenizer=tokenizer,
                return_timestamps=True,
            )
        if self.ner_pipeline is None:
            self.ner_pipeline = pipeline(
                task="ner",
                model=self.ner_model_name,
            )

    def chunk_audio(self, audio, sample_rate):
        """Split audio into fixed-duration chunks for the STT pipeline."""
        chunk_samples = int(self.chunk_duration * sample_rate)
        return [
            {'array': audio[start:start + chunk_samples], 'sampling_rate': sample_rate}
            for start in range(0, len(audio), chunk_samples)
        ]

    def transcribe_audio(self, audio_path):
        """Transcribe an audio file, chunking recordings longer than chunk_duration."""
        self._load_pipelines()

        # Whisper models expect 16 kHz mono input.
        audio, sample_rate = librosa.load(audio_path, sr=16000)
        preprocessed_audio = preprocess_audio(audio, sr=sample_rate)

        if preprocessed_audio is None:
            raise ValueError("Audio preprocessing failed")

        # Long clips are transcribed chunk by chunk and re-joined.
        if len(preprocessed_audio) / sample_rate > self.chunk_duration:
            chunks = self.chunk_audio(preprocessed_audio, sample_rate)
            transcriptions = [
                self.stt_pipeline(chunk)['text'] for chunk in chunks
            ]
            return " ".join(transcriptions)

        return self.stt_pipeline({
            'array': preprocessed_audio,
            'sampling_rate': sample_rate,
        })['text']

    def process_audio(self, audio_path):
        """Run speech-to-text followed by NER on a single audio file."""
        transcription = self.transcribe_audio(audio_path)

        # transcribe_audio has already loaded both pipelines.
        entities = self.ner_pipeline(transcription)

        return transcription, entities

def preprocess_audio(audio_array, sr=16000):
    """Denoise, normalize, and trim audio; returns float32 audio or None on failure."""
    try:
        # Accept torch tensors as well as numpy arrays (move off GPU if needed).
        if isinstance(audio_array, torch.Tensor):
            audio_array = audio_array.detach().cpu().numpy()

        # Downmix multi-channel audio to mono.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=0)

        # Spectral-gating noise reduction with a conservative 50% decrease.
        noise_reduced = nr.reduce_noise(
            y=audio_array,
            sr=sr,
            prop_decrease=0.5,
            n_std_thresh_stationary=1.5,
        )

        # Peak-normalize, then trim leading/trailing silence below 25 dB.
        normalized_audio = librosa.util.normalize(noise_reduced)
        trimmed_audio, _ = librosa.effects.trim(normalized_audio, top_db=25)

        return trimmed_audio.astype(np.float32)

    except Exception as e:
        print(f"Audio preprocessing error: {e}")
        return None
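
# Minimal usage sketch. Assumptions: the two Hugging Face checkpoints above are
# reachable, and "example.wav" is a hypothetical path to a local Uzbek recording.
if __name__ == "__main__":
    asr_ner = AudioSpeechNERPipeline()
    transcription, entities = asr_ner.process_audio("example.wav")
    print("Transcription:", transcription)
    # Each entity is a dict from the token-classification pipeline,
    # e.g. {"entity": ..., "word": ..., "score": ..., "start": ..., "end": ...}.
    for entity in entities:
        print(entity)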