import os
abs_path = os.path.abspath('.')
base_dir = os.path.dirname(os.path.dirname(abs_path))
os.environ['TRANSFORMERS_CACHE'] = os.path.join(base_dir, 'models_cache')
import torch
# Details: https://huggingface.co/docs/diffusers/optimization/fp16#enable-cudnn-autotuner
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor, AutoConfig, WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperFeatureExtractor
from typing import Union, BinaryIO
# from optimum.bettertransformer import BetterTransformer
language = '<|bn|>'
# language = '<|en|>'
task = "transcribe" # transcribe or translate
# model_name = 'openai/whisper-tiny.en'
# model_name = 'openai/whisper-base.en'
# model_name = 'openai/whisper-small.en'
# model_name = 'openai/whisper-medium'
## v2: trained on more epochs with regularization
# model_name = 'openai/whisper-large-v2'
## bangla
# model_name = 'Rakib/whisper-tiny-bn'
#model_name = 'anuragshas/whisper-small-bn'
# model_name = 'anuragshas/whisper-large-v2-bn'
# model_name = "Rakib/whisper-small-bn"
# model_name = "Rakib/whisper-small-bn-all"
# model_name = "Rakib/whisper-small-bn-all-600"
# model_name = "Rakib/whisper-small-bn-all-600-v2"
model_name = "Rakib/whisper-small-bn-crblp"
## lets you know the device count: cuda:0 or cuda:1
# print(torch.cuda.device_count())
device = 0 if torch.cuda.is_available() else -1
# device = -1 #Exclusively CPU
print(f"Using device: {'GPU' if device==0 else 'CPU'}")
if device != 0:
    print("[Warning!] Using CPU could hamper performance")
print("Loading Tokenizer for ASR Speech-to-Text Model...\n" + "*" * 100)
# tokenizer = AutoTokenizer.from_pretrained(model_name, language=language, task=task)
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer = WhisperTokenizer.from_pretrained(model_name)
# tokenizer(['�', '�্র'],add_prefix_space=True, add_special_tokens=False).input_ids
print("Loading Feature Extractor for ASR Speech-to-Text Model...\n" + "*" * 100)
# feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
print("Loading Config for ASR Speech-to-Text Model...\n" + "*" * 100)
config = AutoConfig.from_pretrained(model_name)
print("Loading Processor for ASR Speech-to-Text Model...\n" + "*" * 100)
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
print("Loading WHISPER ASR Speech-to-Text Model...\n" + "*" * 100)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
## BetterTransformer (No Need if PyTorch 2.0 works!!)
## (currently 2secs faster inference than PyTorch 2.0 )
# model = WhisperForConditionalGeneration.from_pretrained(model_name)
# model = BetterTransformer.transform(model)
## bitsandbytes (only Linux & GPU) (requires conda env with conda-based pytorch!!!)
## currently only reduces size. slower inference than native models!!!
## from_pretrained doc: https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/model#transformers.PreTrainedModel.from_pretrained
# model = WhisperForConditionalGeneration.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
## For PyTorch 2.0 (Only Linux)
# model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device="cuda:0")
## mode options are "default", "reduce-overhead" and "max-autotune". See: https://pytorch.org/get-started/pytorch-2.0/#modes
# model = torch.compile(model, mode="default")
asr = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    # processor=processor, # no effect, see: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/automatic_speech_recognition.py
    # config=config, # no effect, see: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/automatic_speech_recognition.py
    device=device,  # 0 (first GPU) or -1 (CPU)
    ## chunk files longer than 30s into shorter samples
    chunk_length_s=30,
    ## the amount of overlap (in secs) to be discarded while stitching the inferenced chunks
    ## stride_length_s is a tuple of the left and right stride (overlap) lengths.
    ## With only 1 number, both sides get the same stride by default.
    ## The stride length on one side defaults to 1/6th of chunk_length_s if not provided.
    # stride_length_s=[8, 8],
    stride_length_s=[5, 5],
    # stride_length_s=[6, 0],
    batch_size=16,
    ignore_warning=True,
    ## force whisper to generate timestamps so that the chunking and stitching can be accurate
    # return_timestamps=True,
    generate_kwargs={
        'language': language,
        'task': task,
        'repetition_penalty': 1.8,
        'num_beams': 2,
        'max_new_tokens': 448,
        'early_stopping': True,
        # 'renormalize_logits': True,
        # [16867]: �, [16867, 156, 100, 235, 156, 12811]: �্র
        'bad_words_ids': [[16867], [16867, 156, 100, 235, 156, 12811]],
        # 'suppress_tokens': [16867, 156, 100, 235, 156, 12811],
    }
)
def transcribe(speech_array: Union[str, BinaryIO], language: str = "en") -> str:
    """
    Transcribes audio to text.
    Args:
        speech_array (str or BinaryIO): path to an audio file, or a file-like object containing audio
        language (str): language code of the audio, e.g. "bn" or "en"
    Returns:
        a string containing the transcription
    """
    asr.model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
    # asr.model.config.max_new_tokens = 448  # default is 448
    result = asr(speech_array)
    return str(result["text"])
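
## Example usage: a minimal sketch. The filename "sample_bn.wav" is hypothetical;
## any audio path readable by the pipeline's ffmpeg loader (wav, mp3, flac, ...) should work.
if __name__ == "__main__":
    text = transcribe("sample_bn.wav", language="bn")
    print(text)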