File size: 4,924 Bytes

50d045b
 
 
 
 
 
49da3ad
50d045b
676724b
 
1b1ae58
cde2415
 
1b1ae58
676724b
49da3ad
90d2625
 
25f5f60
90d2625
 
49da3ad
676724b
662ba0a
676724b
662ba0a
676724b
662ba0a
 
 
676724b
 
 
 
 
 
 
 
662ba0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676724b
662ba0a
 
 
 
676724b
662ba0a
 
676724b
662ba0a
 
 
 
 
676724b
49da3ad
 
02d90c0
 
 
662ba0a
02d90c0
662ba0a
02d90c0
662ba0a
 
 
02d90c0
 
 
 
 
 
 
 
662ba0a
 
 
02d90c0
662ba0a
 
 
 
 
 
 
02d90c0
662ba0a
 
02d90c0
662ba0a
02d90c0
 
 
 
 
662ba0a
 
 
 
 
02d90c0
662ba0a
02d90c0
 
 
 
 
662ba0a
02d90c0
 
 
662ba0a
 
 
 
 
 
 
02d90c0
 
 
49da3ad
 
 
 
676724b

---
license: apache-2.0
language:
- bn
base_model:
- openai/whisper-small

pipeline_tag: automatic-speech-recognition

---
BengaliRegionalASR is a speech-recognition model trained on the Bengali regional dialect dataset [sha1779/Bengali_Regional_dataset](https://huggingface.co/datasets/sha1779/Bengali_Regional_dataset).


This model is trained on the Barishal regional data only. The dataset is taken from the [ভাষা-বিচিত্রা: ASR for Regional Dialects](https://www.kaggle.com/competitions/ben10) competition.

# Try the model

```bash
# Install the runtime dependencies used by the examples below.
# (Inside a Jupyter/Colab cell, prefix the command with "!" or "%".)
pip install librosa torch torchaudio transformers requests
```

```py
import os
import requests
import librosa
import torch
import numpy as np
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration

# Load the fine-tuned Whisper checkpoint together with its preprocessing
# components, and move the model to the GPU when one is available.
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
tokenizer = WhisperTokenizer.from_pretrained(model_path_)
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
# Pin the decoder prompt to Bengali transcription.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Sample recording hosted in the model repository.
# NOTE: despite the variable name, the file is a WAV, not an MP3.
mp3_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

try:
    # The download lives inside the try block so that a partially written
    # temporary file is removed by the finally clause as well.
    print("Downloading audio file...")
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(mp3_url, timeout=60)
    if response.status_code != 200:
        raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)
    print("Download complete.")

    # librosa.load(sr=16000) already resamples to 16 kHz, so no separate
    # resampling step is needed before feature extraction.
    print("Processing audio file...")
    speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)
    input_features = feature_extractor(
        speech_array, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # Generate transcription (first and only sequence of the batch).
    print("Generating transcription...")
    predicted_ids = model.generate(inputs=input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)

    # Print the transcription
    print("Transcription:", transcription)

finally:
    # Clean up: delete the temporary audio file, whether or not the steps
    # above succeeded.
    if os.path.exists(local_audio_path):
        os.remove(local_audio_path)
        print("Temporary audio file deleted.")

```

## For longer audio (more than 30 seconds)
```py
import os
import requests
import librosa
import torch
import numpy as np
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration

# Load the fine-tuned Whisper checkpoint together with its preprocessing
# components, and move the model to the GPU when one is available.
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
tokenizer = WhisperTokenizer.from_pretrained(model_path_)
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
# Pin the decoder prompt to Bengali transcription.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Sample recording hosted in the model repository.
# NOTE: despite the variable name, the file is a WAV, not an MP3.
mp3_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

try:
    # Download the audio file. The timeout keeps the script from hanging
    # forever on a stalled connection.
    response = requests.get(mp3_url, timeout=60)
    if response.status_code != 200:
        raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)

    # librosa.load(sr=16000) already resamples to 16 kHz, so the chunks below
    # need no further resampling.
    speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)

    # Define chunk parameters
    chunk_duration = 30  # seconds
    overlap = 5  # seconds
    chunk_size = int(chunk_duration * sampling_rate)
    overlap_size = int(overlap * sampling_rate)
    step_size = chunk_size - overlap_size

    # Split audio into overlapping chunks. Stop as soon as a chunk reaches the
    # end of the signal; otherwise a final chunk lying entirely inside the
    # previous chunk's overlap would be transcribed a second time.
    chunks = []
    for start in range(0, len(speech_array), step_size):
        chunks.append(speech_array[start : start + chunk_size])
        if start + chunk_size >= len(speech_array):
            break

    # Transcribe each chunk independently.
    # NOTE: consecutive chunks share `overlap` seconds of audio, so words near
    # a chunk boundary may appear twice in the joined text.
    transcriptions = []
    for chunk in chunks:
        input_features = feature_extractor(
            chunk, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features

        # Generate transcription (first and only sequence of the batch).
        predicted_ids = model.generate(inputs=input_features.to(device))[0]
        transcription = processor.decode(predicted_ids, skip_special_tokens=True)
        transcriptions.append(transcription)

    # Combine and print the transcriptions
    print(" ".join(transcriptions))

finally:
    # Clean up the temporary file even if download or transcription failed.
    if os.path.exists(local_audio_path):
        os.remove(local_audio_path)

``` 

# Evaluation
Word Error Rate 0.65 %