import os
abs_path = os.path.abspath('.')
base_dir = os.path.dirname(os.path.dirname(abs_path))
os.environ['TRANSFORMERS_CACHE'] = os.path.join(base_dir, 'models_cache')
import torch
# Details: https://huggingface.co/docs/diffusers/optimization/fp16#enable-cudnn-autotuner
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor, AutoConfig, WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperFeatureExtractor
from typing import Union, BinaryIO
# from optimum.bettertransformer import BetterTransformer
language = '<|bn|>'
# language = '<|en|>'
task = "transcribe" # transcribe or translate
# model_name = 'openai/whisper-tiny.en'
# model_name = 'openai/whisper-base.en'
# model_name = 'openai/whisper-small.en'
# model_name = 'openai/whisper-medium'
## v2: trained on more epochs with regularization
# model_name = 'openai/whisper-large-v2'
## bangla
# model_name = 'Rakib/whisper-tiny-bn'
#model_name = 'anuragshas/whisper-small-bn'
# model_name = 'anuragshas/whisper-large-v2-bn'
# model_name = "Rakib/whisper-small-bn"
# model_name = "Rakib/whisper-small-bn-all"
# model_name = "Rakib/whisper-small-bn-all-600"
# model_name = "Rakib/whisper-small-bn-all-600-v2"
model_name = "Rakib/whisper-small-bn-crblp"
## lets you know the device count: cuda:0 or cuda:1
# print(torch.cuda.device_count())
device = 0 if torch.cuda.is_available() else -1
# device = -1 #Exclusively CPU
print(f"Using device: {'GPU' if device==0 else 'CPU'}")
if device != 0:
    print("[Warning!] Using CPU could hamper performance")
print("Loading Tokenizer for ASR Speech-to-Text Model...\n" + "*" * 100)
# tokenizer = AutoTokenizer.from_pretrained(model_name, language=language, task=task)
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer = WhisperTokenizer.from_pretrained(model_name)
# tokenizer(['�', '�্র'],add_prefix_space=True, add_special_tokens=False).input_ids
print("Loading Feature Extractor for ASR Speech-to-Text Model...\n" + "*" * 100)
# feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
print("Loading Config for ASR Speech-to-Text Model...\n" + "*" * 100)
config = AutoConfig.from_pretrained(model_name)
print("Loading Processor for ASR Speech-to-Text Model...\n" + "*" * 100)
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
print("Loading WHISPER ASR Speech-to-Text Model...\n" + "*" * 100)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
## BetterTransformer (No Need if PyTorch 2.0 works!!)
## (currently 2secs faster inference than PyTorch 2.0 )
# model = WhisperForConditionalGeneration.from_pretrained(model_name)
# model = BetterTransformer.transform(model)
## bitsandbytes (only Linux & GPU) (requires conda env with conda-based pytorch!!!)
## currently only reduces size. slower inference than native models!!!
## from_pretrained doc: https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/model#transformers.PreTrainedModel.from_pretrained
# model = WhisperForConditionalGeneration.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
## For PyTorch 2.0 (Only Linux)
# model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device="cuda:0")
## mode options are "default", "reduce-overhead" and "max-autotune". See: https://pytorch.org/get-started/pytorch-2.0/#modes
# model = torch.compile(model, mode="default")
asr = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    # processor=processor, # no effect, see: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/automatic_speech_recognition.py
    # config=config, # no effect, see: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/automatic_speech_recognition.py
    device=device,  # 0 (first GPU) or -1 (CPU)
    ## chunk files longer than 30s into shorter samples
    chunk_length_s=30,
    ## the amount of overlap (in secs) to be discarded while stitching the inferenced chunks
    ## stride_length_s is a tuple of the left and right stride (overlap) lengths.
    ## With only 1 number, both sides get the same stride by default.
    ## The stride length on one side defaults to 1/6th of chunk_length_s if not provided.
    # stride_length_s=[8, 8],
    stride_length_s=[5, 5],
    # stride_length_s=[6, 0],
    batch_size=16,
    ignore_warning=True,
    ## force whisper to generate timestamps so that the chunking and stitching can be accurate
    # return_timestamps=True,
    generate_kwargs={
        'language': language,
        'task': task,
        'repetition_penalty': 1.8,
        'num_beams': 2,
        'max_new_tokens': 448,
        'early_stopping': True,
        # 'renormalize_logits': True,
        # [16867]: �, [16867, 156, 100, 235, 156, 12811]: �্র
        'bad_words_ids': [[16867], [16867, 156, 100, 235, 156, 12811]],
        # 'suppress_tokens': [16867, 156, 100, 235, 156, 12811],
    }
)
def transcribe(speech_array: Union[str, BinaryIO], language: str = "en") -> str:
    """
    Transcribes audio to text.
    Args:
        speech_array (str or BinaryIO): path to an audio file, or a file-like object containing audio
        language (str): language code of the audio, e.g. "bn" or "en"
    Returns:
        a string containing the transcription
    """
    asr.model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
    # asr.model.config.max_new_tokens = 448  # default is 448
    result = asr(speech_array)
    return str(result["text"])
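
## Example usage: a minimal sketch. The filename "sample_bn.wav" is hypothetical;
## any audio path readable by the pipeline's ffmpeg loader (wav, mp3, flac, ...) should work.
if __name__ == "__main__":
    text = transcribe("sample_bn.wav", language="bn")
    print(text)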