---
license: apache-2.0
language:
- bn
base_model:
- openai/whisper-small
pipeline_tag: automatic-speech-recognition
---
BengaliRegionalASR is a fine-tune of [openai/whisper-small](https://huggingface.co/openai/whisper-small) trained on the Bengali regional dialect dataset [sha1779/Bengali_Regional_dataset](https://huggingface.co/datasets/sha1779/Bengali_Regional_dataset).
This model is trained on the Barishal regional data only. The dataset is taken from the [ভাষা-বিচিত্রা: ASR for Regional Dialects](https://www.kaggle.com/competitions/ben10) competition.
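The training data can be inspected with the `datasets` library. A minimal sketch (the split name below is an assumption; check the dataset card for the exact schema):
```py
from datasets import load_dataset

# Stream the dataset so nothing is downloaded up front.
# The "train" split name is an assumption -- verify it on the dataset card.
ds = load_dataset("sha1779/Bengali_Regional_dataset", split="train", streaming=True)
sample = next(iter(ds))
print(sample.keys())  # inspect the available columns
```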
# Try the model
```bash
pip install librosa torch torchaudio transformers requests
```
```py
import os
import requests
import librosa
import torch
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration

# Define model and device
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Sample audio URL (a WAV file, despite the directory name)
audio_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

# Download the audio file
print("Downloading audio file...")
response = requests.get(audio_url)
if response.status_code == 200:
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)
    print("Download complete.")
else:
    raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")

# Load and preprocess the audio
try:
    print("Processing audio file...")
    # librosa.load with sr=16000 already resamples to Whisper's expected rate
    speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)
    input_features = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt").input_features

    # Generate transcription
    print("Generating transcription...")
    predicted_ids = model.generate(input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)

    # Print the transcription
    print("Transcription:", transcription)
finally:
    # Clean up: delete the temporary audio file
    if os.path.exists(local_audio_path):
        os.remove(local_audio_path)
        print("Temporary audio file deleted.")
```
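For a quick test without manual feature extraction and decoding, the same checkpoint should also work with the high-level `pipeline` API (a minimal sketch, assuming a local audio file at `temp_audio.wav`):
```py
from transformers import pipeline

# Wrap the checkpoint in an ASR pipeline; it handles audio loading,
# resampling, feature extraction and decoding internally.
asr = pipeline(
    "automatic-speech-recognition",
    model="sha1779/BengaliRegionalASR",
    generate_kwargs={"language": "bengali", "task": "transcribe"},
)
print(asr("temp_audio.wav")["text"])
```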
## For longer audio (more than 30 seconds)
```py
import os
import requests
import librosa
import torch
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration

# Define model and device
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Remote audio file URL (a WAV file)
audio_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

# Download the audio file
response = requests.get(audio_url)
if response.status_code == 200:
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)
else:
    raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")

# Load audio at 16 kHz (Whisper's expected sampling rate)
speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)

# Define chunk parameters
chunk_duration = 30  # seconds
overlap = 5  # seconds
chunk_size = int(chunk_duration * sampling_rate)
overlap_size = int(overlap * sampling_rate)

# Split audio into overlapping chunks
chunks = [
    speech_array[start : start + chunk_size]
    for start in range(0, len(speech_array), chunk_size - overlap_size)
]

# Process and transcribe each chunk
transcriptions = []
for chunk in chunks:
    input_features = feature_extractor(chunk, sampling_rate=16000, return_tensors="pt").input_features

    # Generate transcription
    predicted_ids = model.generate(input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    transcriptions.append(transcription)

# Combine and print the transcriptions (overlapping regions may repeat words at chunk boundaries)
print(" ".join(transcriptions))

# Clean up temporary file
os.remove(local_audio_path)
```
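Note that naively joining overlapping chunks can repeat words at the boundaries. As an alternative, the `pipeline` API ships built-in long-form chunking that merges overlaps automatically; a minimal sketch using the same 30 s window / 5 s overlap settings:
```py
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="sha1779/BengaliRegionalASR",
    chunk_length_s=30,   # process audio in 30-second windows
    stride_length_s=5,   # overlap between windows, merged automatically
    generate_kwargs={"language": "bengali", "task": "transcribe"},
)
print(asr("temp_audio.wav")["text"])
```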
# Evaluation
Word Error Rate: 0.65%
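To compute the same metric on your own data, the `evaluate` library provides a WER implementation (a minimal sketch; the lists below are placeholders, not the actual evaluation set):
```py
import evaluate

wer_metric = evaluate.load("wer")

# Placeholder examples -- substitute real model outputs and references
predictions = ["transcribed hypothesis text"]
references = ["ground truth transcript text"]

print(wer_metric.compute(predictions=predictions, references=references))
```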