sha1779 committed
Commit 662ba0a
1 Parent(s): 25f5f60

Update README.md

Files changed (1):
  1. README.md +68 -25
README.md CHANGED
@@ -21,10 +21,13 @@ This model is trained on this barishal regional data only. The dataset is taken

  ```py
  import os
+ import requests
  import librosa
- import torch, torchaudio
+ import torch
  import numpy as np
- from transformers import WhisperTokenizer ,WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration
+ from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration
+
+ # Define model and device
  model_path_ = "sha1779/BengaliRegionalASR"
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
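
A note for readers of the updated setup: `WhisperProcessor` already bundles the feature extractor and tokenizer, so the `WhisperTokenizer` import kept in the new snippet is optional. A minimal sketch illustrating the relationship (an editor's illustration, not part of the commit):

```py
from transformers import WhisperProcessor

# The processor wraps both preprocessing objects
processor = WhisperProcessor.from_pretrained("sha1779/BengaliRegionalASR")
print(type(processor.feature_extractor).__name__)  # WhisperFeatureExtractor
print(type(processor.tokenizer).__name__)          # WhisperTokenizer
```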
@@ -33,28 +36,53 @@ processor = WhisperProcessor.from_pretrained(model_path_)
  model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
  model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

- mp3_path = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
-
- speech_array, sampling_rate = librosa.load(mp3_path, sr=16000)
-
- speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16000)
- input_features = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt").input_features
+ # MP3 URL
+ mp3_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
+ local_audio_path = "temp_audio.wav"
+
+ # Download the MP3 file
+ print("Downloading audio file...")
+ response = requests.get(mp3_url)
+ if response.status_code == 200:
+     with open(local_audio_path, 'wb') as f:
+         f.write(response.content)
+     print("Download complete.")
+ else:
+     raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")
+
+ # Load and preprocess the audio
+ try:
+     print("Processing audio file...")
+     speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)
+     speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16000)
+     input_features = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt").input_features

- predicted_ids = model.generate(inputs=input_features.to(device))[0]
+     # Generate transcription
+     print("Generating transcription...")
+     predicted_ids = model.generate(inputs=input_features.to(device))[0]
+     transcription = processor.decode(predicted_ids, skip_special_tokens=True)

- transcription = processor.decode(predicted_ids, skip_special_tokens=True)
+     # Print the transcription
+     print("Transcription:", transcription)

- print(transcription)
+ finally:
+     # Clean up: delete the temporary audio file
+     if os.path.exists(local_audio_path):
+         os.remove(local_audio_path)
+         print("Temporary audio file deleted.")

  ```

  ## For larger audio , more than 30s
  ```py
  import os
+ import requests
  import librosa
- import torch, torchaudio
+ import torch
  import numpy as np
- from transformers import WhisperTokenizer ,WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration
+ from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration
+
+ # Define model and device
  model_path_ = "sha1779/BengaliRegionalASR"
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
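
Editor's note on the download added in the hunk above: the manual `status_code` check works, but `requests` can also enforce HTTP errors and stream large files to disk. A sketch using the same URL and temp-file name (the `timeout` and `chunk_size` values are illustrative assumptions):

```py
import requests

mp3_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

# Stream to disk; raise_for_status() raises requests.HTTPError on 4xx/5xx
with requests.get(mp3_url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(local_audio_path, "wb") as f:
        for block in response.iter_content(chunk_size=8192):
            f.write(block)
```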
@@ -63,35 +91,50 @@ processor = WhisperProcessor.from_pretrained(model_path_)
  model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
  model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

- mp3_path = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
+ # Remote MP3 file URL
+ mp3_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
+ local_audio_path = "temp_audio.wav"

+ # Download the MP3 file
+ response = requests.get(mp3_url)
+ if response.status_code == 200:
+     with open(local_audio_path, 'wb') as f:
+         f.write(response.content)
+ else:
+     raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")

- speech_array, sampling_rate = librosa.load(mp3_path, sr=16000)
+ # Load audio
+ speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)

- # Split audio into 30-second chunks with 5-second overlap
+ # Define chunk parameters
  chunk_duration = 30 # seconds
  overlap = 5 # seconds
  chunk_size = int(chunk_duration * sampling_rate)
  overlap_size = int(overlap * sampling_rate)

- chunks = []
- for start in range(0, len(speech_array), chunk_size - overlap_size):
-     end = start + chunk_size
-     chunk = speech_array[start:end]
-     chunks.append(chunk)
+ # Split audio into chunks
+ chunks = [
+     speech_array[start : start + chunk_size]
+     for start in range(0, len(speech_array), chunk_size - overlap_size)
+ ]

- # Process each chunk
+ # Process and transcribe each chunk
  transcriptions = []
  for i, chunk in enumerate(chunks):
-
      # Resample and extract features
      chunk = librosa.resample(np.asarray(chunk), orig_sr=sampling_rate, target_sr=16000)
      input_features = feature_extractor(chunk, sampling_rate=16000, return_tensors="pt").input_features
-
+
      # Generate transcription
      predicted_ids = model.generate(inputs=input_features.to(device))[0]
      transcription = processor.decode(predicted_ids, skip_special_tokens=True)
- print(transcription,end=" ")
+     transcriptions.append(transcription)
+
+ # Combine and print the transcriptions
+ print(" ".join(transcriptions))
+
+ # Clean up temporary file
+ os.remove(local_audio_path)

  ```

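Editor's note on the chunking added above: with `chunk_duration = 30` and `overlap = 5`, window starts advance by 25 s, so each chunk repeats the final 5 s of its predecessor and the joined transcription can duplicate words near boundaries. A small illustration of the resulting windows, assuming a hypothetical 70 s clip at 16 kHz:

```py
# Illustration only: window boundaries for a hypothetical 70 s clip
sampling_rate = 16000
chunk_size = 30 * sampling_rate      # 480,000 samples
overlap_size = 5 * sampling_rate     # 80,000 samples
total_samples = 70 * sampling_rate

for start in range(0, total_samples, chunk_size - overlap_size):
    end = min(start + chunk_size, total_samples)
    print(f"{start / sampling_rate:.0f}s -> {end / sampling_rate:.0f}s")
# Prints: 0s -> 30s, 25s -> 55s, 50s -> 70s
```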
 
 
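Finally, the manual chunk loop can also be delegated to the transformers ASR pipeline, which chunks long audio internally. A minimal sketch assuming a local 16 kHz file at `temp_audio.wav` (the chunk length and generate kwargs are illustrative, not taken from the README, and behavior varies across transformers versions):

```py
import torch
from transformers import pipeline

# The ASR pipeline handles >30 s inputs by chunking internally
asr = pipeline(
    "automatic-speech-recognition",
    model="sha1779/BengaliRegionalASR",
    device=0 if torch.cuda.is_available() else -1,
)

result = asr(
    "temp_audio.wav",
    chunk_length_s=30,
    generate_kwargs={"language": "bengali", "task": "transcribe"},
)
print(result["text"])
```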