Update README.md
README.md CHANGED
@@ -40,6 +40,53 @@ print(transcription)

## For larger audio (more than 30 seconds)

```py
import os
import librosa
import torch, torchaudio
import numpy as np
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration

model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
tokenizer = WhisperTokenizer.from_pretrained(model_path_)
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

mp3_path = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"

# Load the audio at 16 kHz, the sampling rate Whisper expects
speech_array, sampling_rate = librosa.load(mp3_path, sr=16000)

# Split audio into 30-second chunks with 5-second overlap
chunk_duration = 30  # seconds
overlap = 5  # seconds
chunk_size = int(chunk_duration * sampling_rate)
overlap_size = int(overlap * sampling_rate)

chunks = []
for start in range(0, len(speech_array), chunk_size - overlap_size):
    end = start + chunk_size
    chunk = speech_array[start:end]
    chunks.append(chunk)

# Process each chunk
transcriptions = []
for i, chunk in enumerate(chunks):
    # Resample and extract features
    chunk = librosa.resample(np.asarray(chunk), orig_sr=sampling_rate, target_sr=16000)
    input_features = feature_extractor(chunk, sampling_rate=16000, return_tensors="pt").input_features

    # Generate transcription
    predicted_ids = model.generate(inputs=input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    transcriptions.append(transcription)
    print(transcription, end=" ")
```
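
Each chunk overlaps the previous one by five seconds, which makes it less likely that a word is cut off at a chunk boundary; the trade-off is that the printed output can repeat a few words around each boundary, so for long recordings you may want to de-duplicate when joining the collected `transcriptions`.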

# Evaluation
Word Error Rate 0.65 %
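
As a minimal sketch of how such a figure can be reproduced, assuming the Hugging Face `evaluate` library's `wer` metric (the tooling used for this number is not stated here, and the transcripts below are placeholders, not the actual evaluation data):

```py
# Hedged sketch: compute Word Error Rate with the `evaluate` library.
# The reference/prediction lists are placeholders for illustration only.
import evaluate

wer_metric = evaluate.load("wer")

references = ["reference transcript one", "reference transcript two"]   # ground-truth transcripts
predictions = ["predicted transcript one", "predicted transcript two"]  # model outputs

wer = wer_metric.compute(references=references, predictions=predictions)
print(f"Word Error Rate: {wer * 100:.2f} %")
```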