# stt_ner/pipe.py
# Uploaded via huggingface_hub (commit 70d6a1c, verified; 3.7 kB)
import os
import librosa
from transformers import pipeline
# BIO tag set emitted by the NER model: index 0 is the outside tag 'O';
# indices 1-5 are B- (begin) tags and 6-10 the matching I- (inside) tags
# for the five entity types, in alphabetical order.
_ENTITY_TYPES = ('DATE', 'EVENT', 'LOC', 'ORG', 'PER')
labels = {0: 'O'}
labels.update({i + 1: f'B-{t}' for i, t in enumerate(_ENTITY_TYPES)})
labels.update({i + 6: f'I-{t}' for i, t in enumerate(_ENTITY_TYPES)})
class AudioSpeechNERPipeline:
    """Speech-to-text plus NER pipeline for Uzbek audio.

    Transcribes an audio file with a Whisper ASR model, then runs
    token-level named-entity recognition on the resulting text.
    """

    def __init__(self,
                 stt_model_name='abduaziz/whisper-small-uz',
                 ner_model_name='abduaziz/bert-ner-uz',
                 stt_language='uz'):
        """Build the two Hugging Face pipelines.

        Args:
            stt_model_name: ASR model checkpoint to load.
            ner_model_name: token-classification (NER) model checkpoint.
            stt_language: kept for interface compatibility; not currently
                passed to the ASR pipeline.
        """
        # Speech-to-text pipeline; timestamps enabled so Whisper can
        # handle inputs longer than its 30-second receptive window.
        self.stt_pipeline = pipeline(
            task="automatic-speech-recognition",
            model=stt_model_name,
            return_timestamps=True  # Enable timestamp support
        )
        # Token-classification (NER) pipeline.
        self.ner_pipeline = pipeline(
            task="ner",
            model=ner_model_name
        )

    @staticmethod
    def _chunk_array(audio, sample_rate, chunk_duration):
        """Split an already-loaded audio array into fixed-size chunk dicts.

        Args:
            audio: 1-D sequence of audio samples.
            sample_rate: samples per second of *audio*.
            chunk_duration: chunk length in seconds.

        Returns:
            List of ``{'array': ..., 'sampling_rate': ...}`` dicts in the
            format the ASR pipeline accepts; the last chunk may be shorter.
        """
        chunk_samples = int(chunk_duration * sample_rate)
        return [
            {'array': audio[start:start + chunk_samples],
             'sampling_rate': sample_rate}
            for start in range(0, len(audio), chunk_samples)
        ]

    def chunk_audio(self, audio_path, chunk_duration=30):
        """Load *audio_path* (resampled to 16 kHz) and chunk it.

        Returns:
            List of chunk dicts suitable for the ASR pipeline.
        """
        audio, sample_rate = librosa.load(audio_path, sr=16000)
        return self._chunk_array(audio, sample_rate, chunk_duration)

    def transcribe_audio(self, audio_path):
        """Transcribe an audio file, chunking if it exceeds 30 seconds.

        Returns:
            The full transcription as a single string.
        """
        audio, sample_rate = librosa.load(audio_path, sr=16000)
        if len(audio) / sample_rate > 30:
            # Chunk the in-memory array directly instead of calling
            # chunk_audio, which would reload the same file from disk
            # a second time.
            transcriptions = [
                self.stt_pipeline(chunk)['text']
                for chunk in self._chunk_array(audio, sample_rate, 30)
            ]
            full_transcription = " ".join(transcriptions)
        else:
            # Short file: a single pipeline call suffices.
            full_transcription = self.stt_pipeline({
                'array': audio,
                'sampling_rate': 16000
            })['text']
        return full_transcription

    def process_audio(self, audio_path):
        """Transcribe *audio_path* and extract named entities.

        Returns:
            Tuple of ``(transcription, entities)`` where *entities* is the
            raw output of the NER pipeline.
        """
        transcription = self.transcribe_audio(audio_path)
        entities = self.ner_pipeline(transcription)
        return transcription, entities
def replace_ner(entities):
    """Map raw ``LABEL_n`` NER tags to human-readable BIO labels.

    Entities whose numeric index is 0 (the outside tag ``'O'``) are
    dropped; every other entity is returned as a copy whose ``'entity'``
    field is replaced using the module-level ``labels`` mapping.
    """
    renamed = []
    for raw in entities:
        idx = int(raw['entity'].rsplit("_", 1)[-1])
        # Index 0 is the non-entity tag - filter it out entirely.
        if idx == 0:
            continue
        renamed.append({**raw, 'entity': labels[idx]})
    return renamed
def process_audio_pipeline(audio):
    """Gradio interface function: transcribe *audio* and extract entities.

    Args:
        audio: path to the audio file supplied by the Gradio widget.

    Returns:
        ``(transcription, entities)`` on success, or
        ``("Error processing audio: ...", "")`` on failure.
    """
    try:
        # Construct inside the try so that model-load failures are also
        # reported via the error-string contract instead of crashing.
        # Named `asr_ner` (not `pipeline`) to avoid shadowing the
        # imported transformers.pipeline factory.
        asr_ner = AudioSpeechNERPipeline()
        # Transcribe, run NER, then prettify the raw LABEL_n tags.
        transcription, entities = asr_ner.process_audio(audio)
        entities = replace_ner(entities)
        return transcription, entities
    except Exception as e:
        # Broad catch is deliberate: this is the top-level UI boundary.
        return f"Error processing audio: {str(e)}", ""