---
license: apache-2.0
language:
  - bn
base_model:
  - openai/whisper-small
pipeline_tag: automatic-speech-recognition
---

# BengaliRegionalASR

BengaliRegionalASR is a fine-tuned [openai/whisper-small](https://huggingface.co/openai/whisper-small) model trained on the Bengali regional dialect dataset [sha1779/Bengali_Regional_dataset](https://huggingface.co/datasets/sha1779/Bengali_Regional_dataset).

The model is trained on the Barishal regional data only. The dataset comes from the ভাষা-বিচিত্রা: ASR for Regional Dialects competition.
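
For a quick smoke test, the model should also load through the generic `transformers` pipeline. This is a minimal sketch using standard pipeline usage rather than anything from the original card; `sample.wav` is a placeholder for your own audio file:

```python
from transformers import pipeline

# Minimal sketch: the ASR pipeline handles audio loading, feature extraction, and decoding.
# "sample.wav" is a hypothetical local file; any format ffmpeg can read should work.
asr = pipeline("automatic-speech-recognition", model="sha1779/BengaliRegionalASR")
print(asr("sample.wav")["text"])
```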

## Try the model

```bash
pip install librosa torch torchaudio transformers
```

```python
import os
import requests
import librosa
import torch
import numpy as np
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration

# Define model and device
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
tokenizer = WhisperTokenizer.from_pretrained(model_path_)
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Remote audio sample (a WAV file hosted in the model repo)
audio_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

# Download the audio file
print("Downloading audio file...")
response = requests.get(audio_url)
if response.status_code == 200:
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)
    print("Download complete.")
else:
    raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")

# Load and preprocess the audio
try:
    print("Processing audio file...")
    # librosa.load with sr=16000 already resamples to Whisper's expected rate
    speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)
    input_features = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt").input_features

    # Generate transcription
    print("Generating transcription...")
    predicted_ids = model.generate(input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)

    # Print the transcription
    print("Transcription:", transcription)

finally:
    # Clean up: delete the temporary audio file
    if os.path.exists(local_audio_path):
        os.remove(local_audio_path)
        print("Temporary audio file deleted.")
```

## For longer audio (more than 30 seconds)

Whisper's encoder works on 30-second windows, so longer recordings are split into overlapping chunks, transcribed chunk by chunk, and the chunk transcriptions are joined:

```python
import os
import requests
import librosa
import torch
import numpy as np
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration

# Define model and device
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
tokenizer = WhisperTokenizer.from_pretrained(model_path_)
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Remote audio sample (a WAV file hosted in the model repo)
audio_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

# Download the audio file
response = requests.get(audio_url)
if response.status_code == 200:
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)
else:
    raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")

# Load audio at 16 kHz (librosa resamples on load)
speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)

# Define chunk parameters
chunk_duration = 30  # seconds, Whisper's maximum window
overlap = 5          # seconds shared between consecutive chunks
chunk_size = int(chunk_duration * sampling_rate)
overlap_size = int(overlap * sampling_rate)

# Split audio into overlapping chunks; the hop between chunk starts is
# chunk_size - overlap_size, so each chunk repeats the last 5 s of the previous one
chunks = [
    speech_array[start : start + chunk_size]
    for start in range(0, len(speech_array), chunk_size - overlap_size)
]

# Process and transcribe each chunk
transcriptions = []
for chunk in chunks:
    # Extract log-mel features for this chunk
    input_features = feature_extractor(chunk, sampling_rate=16000, return_tensors="pt").input_features

    # Generate transcription
    predicted_ids = model.generate(input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    transcriptions.append(transcription)

# Combine and print the transcriptions
print(" ".join(transcriptions))

# Clean up temporary file
os.remove(local_audio_path)
```
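
Because consecutive chunks share 5 seconds of audio, the joined transcription may repeat a few words at chunk boundaries; the simple `" ".join(...)` above accepts that. If throughput matters, the per-chunk loop can also be batched. The sketch below reuses `chunks`, `feature_extractor`, `model`, `processor`, and `device` from the script above; the batching itself is standard `transformers` usage, not part of the original card:

```python
# Hypothetical batched variant, continuing the script above.
# The feature extractor pads every chunk to 30 s, so the features stack cleanly.
batch_features = feature_extractor(chunks, sampling_rate=16000, return_tensors="pt").input_features
predicted_ids = model.generate(batch_features.to(device))
print(" ".join(processor.batch_decode(predicted_ids, skip_special_tokens=True)))
```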

## Evaluation

Word Error Rate: 0.65 %
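
For reference, WER can be computed with the `jiwer` library. A minimal sketch; `jiwer` and the example strings are illustrative, not from the original evaluation:

```python
from jiwer import wer

# Hypothetical reference/hypothesis pair for illustration only
reference = "আমরা ভাত খাই"
hypothesis = "আমরা ভাত খাই"
print(f"WER: {wer(reference, hypothesis):.2%}")
```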