abduaziz committed
Commit d4fb41c
1 Parent(s): 70d6a1c

Upload folder using huggingface_hub

Files changed (5):
  1. .gitignore +2 -0
  2. .gradio/certificate.pem +31 -0
  3. app.py +99 -11
  4. pipe.py +104 -106
  5. requirements.txt +2 -1
.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv
+ __pycache__
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
app.py CHANGED
@@ -1,28 +1,116 @@

- import gradio as gr
  import os
- from pipe import process_audio_pipeline, AudioSpeechNERPipeline
  from huggingface_hub import login

  def create_gradio_interface():
-     # Create Gradio interface
      iface = gr.Interface(
          fn=process_audio_pipeline,
-         inputs=gr.Audio(type="filepath", label="Upload Audio"),
          outputs=[
              gr.Textbox(label="Transcription"),
-             gr.Textbox(label="Named Entities")
          ],
-         title="Uzbek Speech Recognition and Named Entity Recognition",
-         description="Upload an Uzbek audio file (MP3 or WAV) to transcribe and extract named entities."
      )
      return iface

  def main():
-     # Create and launch the Gradio interface
      demo = create_gradio_interface()
-     demo.launch(share=True)

  if __name__ == "__main__":
-     login(token=os.getenv('HF_TOKEN'), new_session=False)
-     AudioSpeechNERPipeline()
      main()

  import os
+ import gradio as gr
  from huggingface_hub import login
+ from pipe import AudioSpeechNERPipeline
+ import html
+
+ # Optimized Labels Dictionary
+ LABELS = {
+     0: 'O', 1: 'B-DATE', 2: 'B-EVENT', 3: 'B-LOC',
+     4: 'B-ORG', 5: 'B-PER', 6: 'I-DATE', 7: 'I-EVENT',
+     8: 'I-LOC', 9: 'I-ORG', 10: 'I-PER'
+ }
+
+ def process_audio_pipeline(audio):
+     """Robust Gradio processing function"""
+     pipeline = AudioSpeechNERPipeline()
+
+     try:
+         transcription, entities = pipeline.process_audio(audio)
+         highlighted_text = highlight_entities(transcription, entities)
+
+         return transcription, highlighted_text
+
+     except Exception as e:
+         return f"Error processing audio: {str(e)}", ""
+
+ def highlight_entities(transcription, entities):
+     """Enhanced entity highlighting with a legend."""
+     # Map entity labels to human-readable labels if needed
+     processed_entities = [
+         {**entity, 'label': LABELS[int(entity['entity'].split("_")[-1])]}
+         for entity in entities if int(entity['entity'].split("_")[-1]) != 0
+     ]
+
+     # Sort entities by their start position to avoid overlapping issues
+     processed_entities.sort(key=lambda x: x.get('start', 0))
+
+     # Escape transcription for HTML safety
+     transcription = html.escape(transcription)
+     highlighted_text = transcription
+     offset = 0  # Track how much the text length changes due to added HTML
+
+     # Define color coding for entity types
+     colors = {
+         'B-PER': 'blue', 'I-PER': 'blue',
+         'B-ORG': 'green', 'I-ORG': 'green',
+         'B-LOC': 'red', 'I-LOC': 'red',
+         'B-DATE': 'purple', 'I-DATE': 'purple',
+         'B-EVENT': 'orange', 'I-EVENT': 'orange'
+     }
+
+     for entity in processed_entities:
+         start = entity.get('start', 0) + offset
+         end = entity.get('end', 0) + offset
+         label = entity['label']
+
+         color = colors.get(label, 'black')
+
+         # Wrap the entity text with a styled span
+         highlighted_part = (
+             f'<span style="background-color: {color}; color: white; '
+             f'padding: 2px; border-radius: 3px;">'
+             f'{highlighted_text[start:end]}</span>'
+         )
+
+         # Replace text in the highlighted_text with the HTML
+         highlighted_text = (
+             highlighted_text[:start] + highlighted_part +
+             highlighted_text[end:]
+         )
+
+         # Update offset to account for added HTML
+         offset += len(highlighted_part) - (end - start)
+
+     # Create a legend for the labels and their colors
+     legend = '<br><br><strong>Legend:</strong><br>'
+     legend += ''.join(
+         f'<span style="background-color: {color}; color: white; '
+         f'padding: 2px; border-radius: 3px; margin-right: 10px;">{label}</span>'
+         for label, color in colors.items()
+     )
+
+     return highlighted_text + legend
+

  def create_gradio_interface():
+     """Enhanced Gradio interface with improved styling"""
      iface = gr.Interface(
          fn=process_audio_pipeline,
+         inputs=gr.Audio(type="filepath", label="Upload Uzbek Audio"),
          outputs=[
              gr.Textbox(label="Transcription"),
+             gr.HTML(label="Named Entities")  # Changed to HTML for highlighting
          ],
+         title="🎙️ Uzbek Speech Recognition & NER",
+         description=(
+             "Upload an Uzbek audio file to transcribe and "
+             "visualize named entities with color-coded highlighting. "
+             "Supports MP3 and WAV formats."
+         ),
+         css=".gradio-container { background-color: #f0f0f0; }"
      )
      return iface

  def main():
+     """Main execution function"""
      demo = create_gradio_interface()
+     demo.launch()

  if __name__ == "__main__":
+     # Optional: Handle HuggingFace login more securely
+     token = os.getenv('HF_TOKEN')
+     if token:
+         login(token=token, new_session=False)
+
      main()
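The new highlight_entities() assumes the NER pipeline emits raw label ids (for example "LABEL_5") plus character offsets, which app.py maps through LABELS before wrapping each span in HTML. The following is a minimal sketch of that decoding step, not part of the commit; the transcription and entity dicts below are invented for illustration (real pipeline output also carries 'score' and 'word' fields):

import html

LABELS = {
    0: 'O', 1: 'B-DATE', 2: 'B-EVENT', 3: 'B-LOC',
    4: 'B-ORG', 5: 'B-PER', 6: 'I-DATE', 7: 'I-EVENT',
    8: 'I-LOC', 9: 'I-ORG', 10: 'I-PER'
}

# Hypothetical transcription and token-classification output
transcription = "Alisher bugun Toshkentga uchdi"
entities = [
    {'entity': 'LABEL_5', 'start': 0, 'end': 7},    # assumed B-PER span: "Alisher"
    {'entity': 'LABEL_3', 'start': 14, 'end': 22},  # assumed B-LOC span: "Toshkent"
]

escaped = html.escape(transcription)
for e in entities:
    idx = int(e['entity'].split("_")[-1])
    if idx == 0:  # 'O' spans are skipped, exactly as in app.py
        continue
    print(LABELS[idx], escaped[e['start']:e['end']])
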
pipe.py CHANGED
@@ -1,123 +1,121 @@

- import os
  import librosa
- from transformers import pipeline
-
- labels = {0: 'O',
-           1: 'B-DATE',
-           2: 'B-EVENT',
-           3: 'B-LOC',
-           4: 'B-ORG',
-           5: 'B-PER',
-           6: 'I-DATE',
-           7: 'I-EVENT',
-           8: 'I-LOC',
-           9: 'I-ORG',
-           10: 'I-PER'}

  class AudioSpeechNERPipeline:
-     def __init__(self,
-                  stt_model_name='abduaziz/whisper-small-uz',
-                  ner_model_name='abduaziz/bert-ner-uz',
-                  stt_language='uz'):
-         # Initialize Speech-to-Text pipeline with timestamp support
-         self.stt_pipeline = pipeline(
-             task="automatic-speech-recognition",
-             model=stt_model_name,
-             return_timestamps=True  # Enable timestamp support
-         )
-         # Initialize NER pipeline
-         self.ner_pipeline = pipeline(
-             task="ner",
-             model=ner_model_name
-         )
-
-     def chunk_audio(self, audio_path, chunk_duration=30):
-         """
-         Chunk long audio files into 30-second segments
-         """
-         # Load audio file
-         audio, sample_rate = librosa.load(audio_path, sr=16000)
-
-         # Calculate chunk size
-         chunk_samples = chunk_duration * sample_rate
-
-         # Create chunks
-         chunks = []
-         for start in range(0, len(audio), chunk_samples):
-             chunk = audio[start:start+chunk_samples]
-             chunks.append({
-                 'array': chunk,
-                 'sampling_rate': 16000
-             })
-
-         return chunks

      def transcribe_audio(self, audio_path):
-         """
-         Handle audio transcription for files longer than 30 seconds
-         """
-         # Check audio length
          audio, sample_rate = librosa.load(audio_path, sr=16000)
-
-         # If audio is longer than 30 seconds, chunk it
-         if len(audio) / sample_rate > 30:
-             audio_chunks = self.chunk_audio(audio_path)
-             transcriptions = []
-
-             for chunk in audio_chunks:
-                 # Transcribe each chunk
-                 chunk_transcription = self.stt_pipeline(chunk)
-                 transcriptions.append(chunk_transcription['text'])
-
-             # Combine transcriptions
-             full_transcription = " ".join(transcriptions)
-         else:
-             # Process audio normally for short files
-             full_transcription = self.stt_pipeline({
-                 'array': audio,
-                 'sampling_rate': 16000
-             })['text']
-
-         return full_transcription

      def process_audio(self, audio_path):
-         # Transcribe audio
          transcription = self.transcribe_audio(audio_path)
-
-         # Extract named entities
          entities = self.ner_pipeline(transcription)
-
          return transcription, entities

- def replace_ner(entities):
-     processed_entities = []
-
-     for entity in entities:
-         number = int(entity['entity'].split("_")[-1])

-         # Skip entities with number 0
-         if number == 0:
-             continue

-         # Create a copy of the entity and update the label
-         updated_entity = entity.copy()
-         updated_entity['entity'] = labels[number]
-         processed_entities.append(updated_entity)
-     return processed_entities
-
- def process_audio_pipeline(audio):
-     """
-     Gradio interface function to process audio
-     """
-     # Initialize pipeline
-     pipeline = AudioSpeechNERPipeline()
-
-     try:
-         # Process the audio
-         transcription, entities = pipeline.process_audio(audio)
-         entities = replace_ner(entities)

-         return transcription, entities

      except Exception as e:
-         return f"Error processing audio: {str(e)}", ""

+ import torch
  import librosa
+ import noisereduce as nr
+ import numpy as np
+ from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, AutoTokenizer

  class AudioSpeechNERPipeline:
+     def __init__(
+         self,
+         stt_model_name='abduaziz/whisper-small-uzbek',
+         ner_model_name='abduaziz/roberta-ner-uzbek',
+         stt_language='uz',
+         chunk_duration=30
+     ):
+         # Use lazy loading for pipelines
+         self.stt_pipeline = None
+         self.ner_pipeline = None
+         self.stt_model_name = stt_model_name
+         self.ner_model_name = ner_model_name
+         self.chunk_duration = chunk_duration
+
+     def load_whisper_model(self, model_name='abduaziz/whisper-small-uzbek'):
+         try:
+             # Load processor
+             processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Uzbek", task="transcribe")
+
+             # Load model
+             model = WhisperForConditionalGeneration.from_pretrained(model_name)
+
+             return model, processor
+
+         except Exception as e:
+             print(f"Error loading Whisper model: {e}")
+             raise
+
+     def _load_pipelines(self):
+         """Lazy load pipelines only when needed"""
+         if self.stt_pipeline is None:
+             # Load Whisper model and processor explicitly
+             model, processor = self.load_whisper_model(self.stt_model_name)
+             tokenizer = AutoTokenizer.from_pretrained('abduaziz/whisper-small-uzbek')
+             self.stt_pipeline = pipeline(
+                 "automatic-speech-recognition",
+                 model=model,
+                 processor=processor,
+                 feature_extractor=processor.feature_extractor,
+                 tokenizer=tokenizer,
+                 return_timestamps=True
+             )
+         if self.ner_pipeline is None:
+             self.ner_pipeline = pipeline(
+                 task="ner",
+                 model=self.ner_model_name
+             )
+
+     def chunk_audio(self, audio, sample_rate):
+         """More efficient audio chunking"""
+         chunk_samples = self.chunk_duration * sample_rate
+         return [
+             {'array': audio[start:start+chunk_samples], 'sampling_rate': sample_rate}
+             for start in range(0, len(audio), chunk_samples)
+         ]

      def transcribe_audio(self, audio_path):
+         """Enhanced audio transcription with better error handling"""
+         self._load_pipelines()
+
          audio, sample_rate = librosa.load(audio_path, sr=16000)
+         preprocessed_audio = preprocess_audio(audio, sr=sample_rate)
+
+         if preprocessed_audio is None:
+             raise ValueError("Audio preprocessing failed")
+
+         if len(preprocessed_audio) / sample_rate > self.chunk_duration:
+             chunks = self.chunk_audio(preprocessed_audio, sample_rate)
+             transcriptions = [
+                 self.stt_pipeline(chunk)['text'] for chunk in chunks
+             ]
+             return " ".join(transcriptions)
+
+         return self.stt_pipeline({
+             'array': preprocessed_audio,
+             'sampling_rate': sample_rate
+         })['text']

      def process_audio(self, audio_path):
+         """Streamlined audio processing"""
          transcription = self.transcribe_audio(audio_path)
+
+         self._load_pipelines()
          entities = self.ner_pipeline(transcription)
+
          return transcription, entities

+ def preprocess_audio(audio_array, sr=16000):
+     """Improved audio preprocessing with better type handling"""
+     try:
+         # Handle tensor or numpy array input
+         if isinstance(audio_array, torch.Tensor):
+             audio_array = audio_array.numpy()

+         # Convert stereo to mono
+         if audio_array.ndim > 1:
+             audio_array = audio_array.mean(axis=0)

+         # Noise reduction and normalization
+         noise_reduced = nr.reduce_noise(
+             y=audio_array,
+             sr=sr,
+             prop_decrease=0.5,
+             n_std_thresh_stationary=1.5
+         )

+         normalized_audio = librosa.util.normalize(noise_reduced)
+         trimmed_audio, _ = librosa.effects.trim(normalized_audio, top_db=25)
+
+         return trimmed_audio.astype(np.float32)

      except Exception as e:
+         print(f"Audio preprocessing error: {e}")
+         return None
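For reference, the reworked class can be exercised locally roughly as follows. This is a sketch, not part of the commit: "sample.wav" is a placeholder path, and the Whisper and NER models are downloaded from the Hub on first use.

from pipe import AudioSpeechNERPipeline

pipe = AudioSpeechNERPipeline()                     # pipelines are lazy-loaded, so construction is cheap
text, entities = pipe.process_audio("sample.wav")   # hypothetical local file

print(text)
for ent in entities:
    # each dict from the NER pipeline carries the raw label plus character offsets
    print(ent.get("entity"), ent.get("start"), ent.get("end"), ent.get("word"))
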
requirements.txt CHANGED
@@ -4,4 +4,5 @@ accelerate
  soundfile
  librosa
  gradio
- huggingface_hub
+ huggingface_hub
+ noisereduce
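noisereduce is the only new dependency; an illustrative check (not part of the commit) that it installs and runs alongside the existing stack, using synthetic audio at the 16 kHz rate pipe.py works with:

import numpy as np
import noisereduce as nr

# One second of synthetic noise; reduce_noise returns an array of the same length
noisy = np.random.randn(16000).astype(np.float32)
cleaned = nr.reduce_noise(y=noisy, sr=16000, prop_decrease=0.5)
print(noisy.shape, cleaned.shape)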