sha1779 committed on
Commit
02d90c0
1 Parent(s): 3a70428

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +47 -0
README.md CHANGED
@@ -40,6 +40,53 @@ print(transcription)
40
 
41
  ```
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  # Evaluation
44
  Word Error Rate 0.65 %
45
 
 
40
 
41
  ```
42
 
43
+ ## For longer audio (more than 30 seconds)
44
+ ```py
45
+ import os
46
+ import librosa
47
+ import torch, torchaudio
48
+ import numpy as np
49
+ from transformers import WhisperTokenizer ,WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration
50
+ model_path_ = "sha1779/BengaliRegionalASR"
51
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
52
+ feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
53
+ tokenizer = WhisperTokenizer.from_pretrained(model_path_)
54
+ processor = WhisperProcessor.from_pretrained(model_path_)
55
+ model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
56
+ model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")
57
+
58
+ mp3_path = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
59
+
60
+
61
+ speech_array, sampling_rate = librosa.load(mp3_path, sr=16000)
62
+
63
+ # Split audio into 30-second chunks with 5-second overlap
64
+ chunk_duration = 30 # seconds
65
+ overlap = 5 # seconds
66
+ chunk_size = int(chunk_duration * sampling_rate)
67
+ overlap_size = int(overlap * sampling_rate)
68
+
69
+ chunks = []
70
+ for start in range(0, len(speech_array), chunk_size - overlap_size):
71
+ end = start + chunk_size
72
+ chunk = speech_array[start:end]
73
+ chunks.append(chunk)
74
+
75
+ # Process each chunk
76
+ transcriptions = []
77
+ for i, chunk in enumerate(chunks):
78
+
79
+ # Resample (a no-op here: the audio was already loaded at 16 kHz) and extract features
80
+ chunk = librosa.resample(np.asarray(chunk), orig_sr=sampling_rate, target_sr=16000)
81
+ input_features = feature_extractor(chunk, sampling_rate=16000, return_tensors="pt").input_features
82
+
83
+ # Generate transcription
84
+ predicted_ids = model.generate(inputs=input_features.to(device))[0]
85
+ transcription = processor.decode(predicted_ids, skip_special_tokens=True)
86
+ print(transcription,end=" ")
87
+
88
+ ```
89
+
90
  # Evaluation
91
  Word Error Rate 0.65 %
92