abduaziz committed
Commit d4fb41c
1 Parent(s): 70d6a1c

Upload folder using huggingface_hub

Files changed (5):
  1. .gitignore +2 -0
  2. .gradio/certificate.pem +31 -0
  3. app.py +99 -11
  4. pipe.py +104 -106
  5. requirements.txt +2 -1
.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv
+ __pycache__
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
app.py CHANGED
@@ -1,28 +1,116 @@

- import gradio as gr
  import os
- from pipe import process_audio_pipeline, AudioSpeechNERPipeline
  from huggingface_hub import login

  def create_gradio_interface():
-     # Create Gradio interface
      iface = gr.Interface(
          fn=process_audio_pipeline,
-         inputs=gr.Audio(type="filepath", label="Upload Audio"),
          outputs=[
              gr.Textbox(label="Transcription"),
-             gr.Textbox(label="Named Entities")
          ],
-         title="Uzbek Speech Recognition and Named Entity Recognition",
-         description="Upload an Uzbek audio file (MP3 or WAV) to transcribe and extract named entities."
      )
      return iface

  def main():
-     # Create and launch the Gradio interface
      demo = create_gradio_interface()
-     demo.launch(share=True)

  if __name__ == "__main__":
-     login(token=os.getenv('HF_TOKEN'), new_session=False)
-     AudioSpeechNERPipeline()
      main()

  import os
+ import gradio as gr
  from huggingface_hub import login
+ from pipe import AudioSpeechNERPipeline
+ import html
+
+ # Optimized Labels Dictionary
+ LABELS = {
+     0: 'O', 1: 'B-DATE', 2: 'B-EVENT', 3: 'B-LOC',
+     4: 'B-ORG', 5: 'B-PER', 6: 'I-DATE', 7: 'I-EVENT',
+     8: 'I-LOC', 9: 'I-ORG', 10: 'I-PER'
+ }
+
+ def process_audio_pipeline(audio):
+     """Robust Gradio processing function"""
+     pipeline = AudioSpeechNERPipeline()
+
+     try:
+         transcription, entities = pipeline.process_audio(audio)
+         highlighted_text = highlight_entities(transcription, entities)
+
+         return transcription, highlighted_text
+
+     except Exception as e:
+         return f"Error processing audio: {str(e)}", ""
+
+ def highlight_entities(transcription, entities):
+     """Enhanced entity highlighting with a legend."""
+     # Map entity labels to human-readable labels if needed
+     processed_entities = [
+         {**entity, 'label': LABELS[int(entity['entity'].split("_")[-1])]}
+         for entity in entities if int(entity['entity'].split("_")[-1]) != 0
+     ]
+
+     # Sort entities by their start position to avoid overlapping issues
+     processed_entities.sort(key=lambda x: x.get('start', 0))
+
+     # Escape transcription for HTML safety
+     transcription = html.escape(transcription)
+     highlighted_text = transcription
+     offset = 0  # Track how much the text length changes due to added HTML
+
+     # Define color coding for entity types
+     colors = {
+         'B-PER': 'blue', 'I-PER': 'blue',
+         'B-ORG': 'green', 'I-ORG': 'green',
+         'B-LOC': 'red', 'I-LOC': 'red',
+         'B-DATE': 'purple', 'I-DATE': 'purple',
+         'B-EVENT': 'orange', 'I-EVENT': 'orange'
+     }
+
+     for entity in processed_entities:
+         start = entity.get('start', 0) + offset
+         end = entity.get('end', 0) + offset
+         label = entity['label']
+
+         color = colors.get(label, 'black')
+
+         # Wrap the entity text with a styled span
+         highlighted_part = (
+             f'<span style="background-color: {color}; color: white; '
+             f'padding: 2px; border-radius: 3px;">'
+             f'{highlighted_text[start:end]}</span>'
+         )
+
+         # Replace text in the highlighted_text with the HTML
+         highlighted_text = (
+             highlighted_text[:start] + highlighted_part +
+             highlighted_text[end:]
+         )
+
+         # Update offset to account for added HTML
+         offset += len(highlighted_part) - (end - start)
+
+     # Create a legend for the labels and their colors
+     legend = '<br><br><strong>Legend:</strong><br>'
+     legend += ''.join(
+         f'<span style="background-color: {color}; color: white; '
+         f'padding: 2px; border-radius: 3px; margin-right: 10px;">{label}</span>'
+         for label, color in colors.items()
+     )
+
+     return highlighted_text + legend
+

  def create_gradio_interface():
+     """Enhanced Gradio interface with improved styling"""
      iface = gr.Interface(
          fn=process_audio_pipeline,
+         inputs=gr.Audio(type="filepath", label="Upload Uzbek Audio"),
          outputs=[
              gr.Textbox(label="Transcription"),
+             gr.HTML(label="Named Entities")  # Changed to HTML for highlighting
          ],
+         title="🎙️ Uzbek Speech Recognition & NER",
+         description=(
+             "Upload an Uzbek audio file to transcribe and "
+             "visualize named entities with color-coded highlighting. "
+             "Supports MP3 and WAV formats."
+         ),
+         css=".gradio-container { background-color: #f0f0f0; }"
      )
      return iface

  def main():
+     """Main execution function"""
      demo = create_gradio_interface()
+     demo.launch()

  if __name__ == "__main__":
+     # Optional: Handle HuggingFace login more securely
+     token = os.getenv('HF_TOKEN')
+     if token:
+         login(token=token, new_session=False)
+
      main()
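The new highlight_entities() assumes the NER pipeline emits raw label ids (for example "LABEL_5") plus character offsets, which app.py maps through LABELS before wrapping each span in HTML. The following is a minimal sketch of that decoding step, not part of the commit; the transcription and entity dicts below are invented for illustration (real pipeline output also carries 'score' and 'word' fields):

import html

LABELS = {
    0: 'O', 1: 'B-DATE', 2: 'B-EVENT', 3: 'B-LOC',
    4: 'B-ORG', 5: 'B-PER', 6: 'I-DATE', 7: 'I-EVENT',
    8: 'I-LOC', 9: 'I-ORG', 10: 'I-PER'
}

# Hypothetical transcription and token-classification output
transcription = "Alisher bugun Toshkentga uchdi"
entities = [
    {'entity': 'LABEL_5', 'start': 0, 'end': 7},    # assumed B-PER span: "Alisher"
    {'entity': 'LABEL_3', 'start': 14, 'end': 22},  # assumed B-LOC span: "Toshkent"
]

escaped = html.escape(transcription)
for e in entities:
    idx = int(e['entity'].split("_")[-1])
    if idx == 0:  # 'O' spans are skipped, exactly as in app.py
        continue
    print(LABELS[idx], escaped[e['start']:e['end']])
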
pipe.py CHANGED
@@ -1,123 +1,121 @@

- import os
  import librosa
- from transformers import pipeline
-
- labels = {0: 'O',
-           1: 'B-DATE',
-           2: 'B-EVENT',
-           3: 'B-LOC',
-           4: 'B-ORG',
-           5: 'B-PER',
-           6: 'I-DATE',
-           7: 'I-EVENT',
-           8: 'I-LOC',
-           9: 'I-ORG',
-           10: 'I-PER'}

  class AudioSpeechNERPipeline:
-     def __init__(self,
-                  stt_model_name='abduaziz/whisper-small-uz',
-                  ner_model_name='abduaziz/bert-ner-uz',
-                  stt_language='uz'):
-         # Initialize Speech-to-Text pipeline with timestamp support
-         self.stt_pipeline = pipeline(
-             task="automatic-speech-recognition",
-             model=stt_model_name,
-             return_timestamps=True  # Enable timestamp support
-         )
-         # Initialize NER pipeline
-         self.ner_pipeline = pipeline(
-             task="ner",
-             model=ner_model_name
-         )
-
-     def chunk_audio(self, audio_path, chunk_duration=30):
-         """
-         Chunk long audio files into 30-second segments
-         """
-         # Load audio file
-         audio, sample_rate = librosa.load(audio_path, sr=16000)
-
-         # Calculate chunk size
-         chunk_samples = chunk_duration * sample_rate
-
-         # Create chunks
-         chunks = []
-         for start in range(0, len(audio), chunk_samples):
-             chunk = audio[start:start+chunk_samples]
-             chunks.append({
-                 'array': chunk,
-                 'sampling_rate': 16000
-             })
-
-         return chunks

      def transcribe_audio(self, audio_path):
-         """
-         Handle audio transcription for files longer than 30 seconds
-         """
-         # Check audio length
          audio, sample_rate = librosa.load(audio_path, sr=16000)
-
-         # If audio is longer than 30 seconds, chunk it
-         if len(audio) / sample_rate > 30:
-             audio_chunks = self.chunk_audio(audio_path)
-             transcriptions = []
-
-             for chunk in audio_chunks:
-                 # Transcribe each chunk
-                 chunk_transcription = self.stt_pipeline(chunk)
-                 transcriptions.append(chunk_transcription['text'])
-
-             # Combine transcriptions
-             full_transcription = " ".join(transcriptions)
-         else:
-             # Process audio normally for short files
-             full_transcription = self.stt_pipeline({
-                 'array': audio,
-                 'sampling_rate': 16000
-             })['text']
-
-         return full_transcription

      def process_audio(self, audio_path):
-         # Transcribe audio
          transcription = self.transcribe_audio(audio_path)
-
-         # Extract named entities
          entities = self.ner_pipeline(transcription)
-
          return transcription, entities

- def replace_ner(entities):
-     processed_entities = []
-
-     for entity in entities:
-         number = int(entity['entity'].split("_")[-1])

-         # Skip entities with number 0
-         if number == 0:
-             continue

-         # Create a copy of the entity and update the label
-         updated_entity = entity.copy()
-         updated_entity['entity'] = labels[number]
-         processed_entities.append(updated_entity)
-     return processed_entities
-
- def process_audio_pipeline(audio):
-     """
-     Gradio interface function to process audio
-     """
-     # Initialize pipeline
-     pipeline = AudioSpeechNERPipeline()
-
-     try:
-         # Process the audio
-         transcription, entities = pipeline.process_audio(audio)
-         entities = replace_ner(entities)

-         return transcription, entities

      except Exception as e:
-         return f"Error processing audio: {str(e)}", ""

+ import torch
  import librosa
+ import noisereduce as nr
+ import numpy as np
+ from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, AutoTokenizer

  class AudioSpeechNERPipeline:
+     def __init__(
+         self,
+         stt_model_name='abduaziz/whisper-small-uzbek',
+         ner_model_name='abduaziz/roberta-ner-uzbek',
+         stt_language='uz',
+         chunk_duration=30
+     ):
+         # Use lazy loading for pipelines
+         self.stt_pipeline = None
+         self.ner_pipeline = None
+         self.stt_model_name = stt_model_name
+         self.ner_model_name = ner_model_name
+         self.chunk_duration = chunk_duration
+
+     def load_whisper_model(self, model_name='abduaziz/whisper-small-uzbek'):
+         try:
+             # Load processor
+             processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Uzbek", task="transcribe")
+
+             # Load model
+             model = WhisperForConditionalGeneration.from_pretrained(model_name)
+
+             return model, processor
+
+         except Exception as e:
+             print(f"Error loading Whisper model: {e}")
+             raise
+
+     def _load_pipelines(self):
+         """Lazy load pipelines only when needed"""
+         if self.stt_pipeline is None:
+             # Load Whisper model and processor explicitly
+             model, processor = self.load_whisper_model(self.stt_model_name)
+             tokenizer = AutoTokenizer.from_pretrained('abduaziz/whisper-small-uzbek')
+             self.stt_pipeline = pipeline(
+                 "automatic-speech-recognition",
+                 model=model,
+                 processor=processor,
+                 feature_extractor=processor.feature_extractor,
+                 tokenizer=tokenizer,
+                 return_timestamps=True
+             )
+         if self.ner_pipeline is None:
+             self.ner_pipeline = pipeline(
+                 task="ner",
+                 model=self.ner_model_name
+             )
+
+     def chunk_audio(self, audio, sample_rate):
+         """More efficient audio chunking"""
+         chunk_samples = self.chunk_duration * sample_rate
+         return [
+             {'array': audio[start:start+chunk_samples], 'sampling_rate': sample_rate}
+             for start in range(0, len(audio), chunk_samples)
+         ]

      def transcribe_audio(self, audio_path):
+         """Enhanced audio transcription with better error handling"""
+         self._load_pipelines()
+
          audio, sample_rate = librosa.load(audio_path, sr=16000)
+         preprocessed_audio = preprocess_audio(audio, sr=sample_rate)
+
+         if preprocessed_audio is None:
+             raise ValueError("Audio preprocessing failed")
+
+         if len(preprocessed_audio) / sample_rate > self.chunk_duration:
+             chunks = self.chunk_audio(preprocessed_audio, sample_rate)
+             transcriptions = [
+                 self.stt_pipeline(chunk)['text'] for chunk in chunks
+             ]
+             return " ".join(transcriptions)
+
+         return self.stt_pipeline({
+             'array': preprocessed_audio,
+             'sampling_rate': sample_rate
+         })['text']

      def process_audio(self, audio_path):
+         """Streamlined audio processing"""
          transcription = self.transcribe_audio(audio_path)
+
+         self._load_pipelines()
          entities = self.ner_pipeline(transcription)
+
          return transcription, entities

+ def preprocess_audio(audio_array, sr=16000):
+     """Improved audio preprocessing with better type handling"""
+     try:
+         # Handle tensor or numpy array input
+         if isinstance(audio_array, torch.Tensor):
+             audio_array = audio_array.numpy()

+         # Convert stereo to mono
+         if audio_array.ndim > 1:
+             audio_array = audio_array.mean(axis=0)

+         # Noise reduction and normalization
+         noise_reduced = nr.reduce_noise(
+             y=audio_array,
+             sr=sr,
+             prop_decrease=0.5,
+             n_std_thresh_stationary=1.5
+         )

+         normalized_audio = librosa.util.normalize(noise_reduced)
+         trimmed_audio, _ = librosa.effects.trim(normalized_audio, top_db=25)
+
+         return trimmed_audio.astype(np.float32)

      except Exception as e:
+         print(f"Audio preprocessing error: {e}")
+         return None
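For reference, the reworked class can be exercised locally roughly as follows. This is a sketch, not part of the commit: "sample.wav" is a placeholder path, and the Whisper and NER models are downloaded from the Hub on first use.

from pipe import AudioSpeechNERPipeline

pipe = AudioSpeechNERPipeline()                     # pipelines are lazy-loaded, so construction is cheap
text, entities = pipe.process_audio("sample.wav")   # hypothetical local file

print(text)
for ent in entities:
    # each dict from the NER pipeline carries the raw label plus character offsets
    print(ent.get("entity"), ent.get("start"), ent.get("end"), ent.get("word"))
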
requirements.txt CHANGED
@@ -4,4 +4,5 @@ accelerate
  soundfile
  librosa
  gradio
- huggingface_hub
+ huggingface_hub
+ noisereduce
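noisereduce is the only new dependency; an illustrative check (not part of the commit) that it installs and runs alongside the existing stack, using synthetic audio at the 16 kHz rate pipe.py works with:

import numpy as np
import noisereduce as nr

# One second of synthetic noise; reduce_noise returns an array of the same length
noisy = np.random.randn(16000).astype(np.float32)
cleaned = nr.reduce_noise(y=noisy, sr=16000, prop_decrease=0.5)
print(noisy.shape, cleaned.shape)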