File size: 4,924 Bytes
50d045b
 
 
 
 
 
49da3ad
50d045b
676724b
 
1b1ae58
cde2415
 
1b1ae58
676724b
49da3ad
90d2625
 
25f5f60
90d2625
 
49da3ad
676724b
662ba0a
676724b
662ba0a
676724b
662ba0a
 
 
676724b
 
 
 
 
 
 
 
662ba0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676724b
662ba0a
 
 
 
676724b
662ba0a
 
676724b
662ba0a
 
 
 
 
676724b
49da3ad
 
02d90c0
 
 
662ba0a
02d90c0
662ba0a
02d90c0
662ba0a
 
 
02d90c0
 
 
 
 
 
 
 
662ba0a
 
 
02d90c0
662ba0a
 
 
 
 
 
 
02d90c0
662ba0a
 
02d90c0
662ba0a
02d90c0
 
 
 
 
662ba0a
 
 
 
 
02d90c0
662ba0a
02d90c0
 
 
 
 
662ba0a
02d90c0
 
 
662ba0a
 
 
 
 
 
 
02d90c0
 
 
49da3ad
 
 
 
676724b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
---
license: apache-2.0
language:
- bn
base_model:
- openai/whisper-small

pipeline_tag: automatic-speech-recognition

---
BengaliRegionalASR is trained on a Bengali regional dialect dataset: [sha1779/Bengali_Regional_dataset](https://huggingface.co/datasets/sha1779/Bengali_Regional_dataset)


This model is trained on the Barishal regional data only. The dataset is taken from the [ভাষা-বিচিত্রা: ASR for Regional Dialects](https://www.kaggle.com/competitions/ben10) competition.

# Try the model

```bash
# Run in a shell. Inside a Jupyter/Colab cell, prefix the command with "!" or use "%pip install ...".
# requests and numpy are listed explicitly because the examples below import them directly.
pip install librosa torch torchaudio transformers requests numpy
```

```py
import os
import requests
import librosa
import torch
import numpy as np
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration

# Transcribe a single short (<= 30 s) clip with the fine-tuned Whisper model.

# Whisper's feature extractor expects 16 kHz mono audio.
TARGET_SAMPLE_RATE = 16000

# Define model and device
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
# The processor wraps the tokenizer, so no separate tokenizer needs to be loaded.
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
# Pin the decoder prompt to Bengali transcription (no language detection, no translation).
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Sample audio: a WAV file hosted in the model repository
audio_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

# Everything that can create the temporary file lives inside the try block,
# so the finally clause removes it even if the download or inference fails.
try:
    # Download the audio file; the timeout keeps a stalled connection from hanging forever.
    print("Downloading audio file...")
    response = requests.get(audio_url, timeout=60)
    if response.status_code != 200:
        raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)
    print("Download complete.")

    # Load and preprocess the audio.
    # librosa.load already resamples to the requested rate, so no extra resample step is needed.
    print("Processing audio file...")
    speech_array, sampling_rate = librosa.load(local_audio_path, sr=TARGET_SAMPLE_RATE)
    input_features = feature_extractor(
        speech_array, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # Generate transcription.
    # The features are passed positionally: newer transformers releases name this
    # argument `input_features` (the old `inputs=` keyword is deprecated), and a
    # positional argument works with both.
    print("Generating transcription...")
    predicted_ids = model.generate(input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)

    # Print the transcription
    print("Transcription:", transcription)

finally:
    # Clean up: delete the temporary audio file
    if os.path.exists(local_audio_path):
        os.remove(local_audio_path)
        print("Temporary audio file deleted.")

```

## For longer audio (more than 30 s)
```py
import os
import requests
import librosa
import torch
import numpy as np
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration

# Transcribe a long recording by splitting it into 30 s windows, because
# Whisper processes at most 30 s of audio per forward pass.

# Whisper's feature extractor expects 16 kHz mono audio.
TARGET_SAMPLE_RATE = 16000

# Define model and device
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
# The processor wraps the tokenizer, so no separate tokenizer needs to be loaded.
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
# Pin the decoder prompt to Bengali transcription (no language detection, no translation).
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Sample audio: a WAV file hosted in the model repository
audio_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

# Define chunk parameters
chunk_duration = 30  # seconds
overlap = 5  # seconds shared by consecutive chunks so words at a boundary are not cut

# Everything that can create the temporary file lives inside the try block,
# so the finally clause removes it even if the download or inference fails.
try:
    # Download the audio file; the timeout keeps a stalled connection from hanging forever.
    response = requests.get(audio_url, timeout=60)
    if response.status_code != 200:
        raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)

    # Load audio. librosa.load already resamples to the requested rate, so the
    # chunks need no further resampling.
    speech_array, sampling_rate = librosa.load(local_audio_path, sr=TARGET_SAMPLE_RATE)

    chunk_size = int(chunk_duration * sampling_rate)
    overlap_size = int(overlap * sampling_rate)
    step_size = chunk_size - overlap_size

    # Split audio into chunks
    chunks = []
    for start in range(0, len(speech_array), step_size):
        chunks.append(speech_array[start : start + chunk_size])
        # Stop once a chunk reaches the end of the audio. Without this, a final
        # chunk lying entirely inside the previous one would be transcribed twice.
        if start + chunk_size >= len(speech_array):
            break

    # Process and transcribe each chunk
    transcriptions = []
    for chunk in chunks:
        input_features = feature_extractor(
            chunk, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features

        # The features are passed positionally: newer transformers releases name
        # this argument `input_features` (the old `inputs=` keyword is deprecated),
        # and a positional argument works with both.
        predicted_ids = model.generate(input_features.to(device))[0]
        transcriptions.append(processor.decode(predicted_ids, skip_special_tokens=True))

    # Combine and print the transcriptions.
    # NOTE: words spoken inside an overlap region can appear in both neighbouring
    # chunks; set overlap = 0 above if repeated words are a problem.
    print(" ".join(transcriptions))

finally:
    # Clean up temporary file
    if os.path.exists(local_audio_path):
        os.remove(local_audio_path)

``` 

# Evaluation
Word Error Rate 0.65 %