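# Gradio demo for Konkani (gom) speech recognition, built around the fine-tuned
# Whisper-small checkpoint "thak123/gom-stt-v3". Audio can come from the
# microphone or from an uploaded file.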
from transformers import WhisperTokenizer, pipeline
import gradio as gr
import os
import torch
import torchaudio

# Tokenizer from the base Whisper checkpoint; the fine-tuned model below reuses it.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")  # , language="marathi", task="transcribe"

# ASR pipeline built on the fine-tuned Konkani (gom) checkpoint.
pipe = pipeline(
    model="thak123/gom-stt-v3",  # alternatives: "thak123/whisper-small-LDC-V1", "thak123/whisper-small-gom"
    task="automatic-speech-recognition",
    tokenizer=tokenizer,
)
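# Example (sketch): pipe("audio/ekdonteen.flac") returns a dict whose "text"
# field holds the transcription string.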
# pipe.model.config.forced_decoder_ids = (
#     pipe.tokenizer.get_decoder_prompt_ids(
#         language="marathi", task="transcribe"
#     )
# )
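# Note: if decoding drifts into the wrong language, the block above can be
# re-enabled to pin the decoder prompt to a fixed language/task via
# get_decoder_prompt_ids (assumes the tokenizer carries those language tokens).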
# def transcribe_speech(filepath):
#     # waveform, sample_rate = torchaudio.load(filepath)
#     # Resample the audio signal to 16k sampling rate
#     # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
#     # waveform_16k = resampler(waveform)
#     # Save the resampled audio signal to a new file
#     # torchaudio.save(filepath, waveform_16k, 16000)
#     output = pipe(
#         filepath,
#         max_new_tokens=3,
#         generate_kwargs={
#             "task": "transcribe",
#             # "language": "konkani",
#         },  # update with the language you've fine-tuned on
#         chunk_length_s=30,
#         batch_size=8,
#         # sampling_rate=16000,
#         # padding=True
#     )
#     print(output)
#     return output["text"]
def transcribe_speech(filepath):
    from transformers import WhisperProcessor, WhisperForConditionalGeneration
    import librosa

    # Load the fine-tuned Konkani model and the base Whisper processor.
    model = WhisperForConditionalGeneration.from_pretrained("thak123/gom-stt-v3")
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    output = ""

    # Load and resample the audio to the 16 kHz rate Whisper expects.
    audio, sr = librosa.load(filepath, sr=16000)
    input_features = processor(
        audio,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=False,
        padding="max_length",
    ).input_features

    # Whisper's encoder accepts at most 3000 log-mel frames (about 30 s of audio).
    if input_features.shape[-1] > 3000:
        print("Splitting audio required")
        # Long clip: split into 30-second chunks and transcribe each with the pipeline.
        from pydub import AudioSegment

        def split_audio(file_path, chunk_length_ms=30000):  # 30-second chunks
            audio_seg = AudioSegment.from_file(file_path)
            return [audio_seg[i:i + chunk_length_ms] for i in range(0, len(audio_seg), chunk_length_ms)]

        for i, chunk in enumerate(split_audio(filepath)):
            chunk.export(f"chunk_{i}.wav", format="wav")
            result = pipe(f"chunk_{i}.wav")
            output += result["text"] + " "
            print(f"Chunk {i}: {result['text']}")
    else:
        # Short clip: a single generate pass over the whole input.
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        output = transcription[0]
        print(transcription)

    return output
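# Alternative (untested sketch): the ASR pipeline can chunk long files itself,
# e.g. pipe(filepath, chunk_length_s=30, batch_size=8), which would avoid
# writing intermediate chunk_*.wav files.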
demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.components.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.components.Textbox(),
    examples=[
        [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
        [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
        [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
        [os.path.join(os.path.dirname("."), "audio/panaji1920-9.mp3")],
    ],
)
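# The example clips are expected to sit in an "audio/" folder next to this
# script; os.path.dirname(".") resolves to "", so the paths are relative to
# the working directory.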
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)
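# debug=True keeps launch() in the foreground and prints full tracebacks,
# which is handy when inspecting errors in the Space logs.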
# # def transcribe(audio):
# #     # text = pipe(audio)["text"]
# #     # pipe(audio)
# #     text = pipe(audio)
# #     print("op", text)
# #     return text  # pipe(audio) #text

# # iface = gr.Interface(
# #     fn=transcribe,
# #     inputs=[gr.Audio(sources=["microphone", "upload"])],
# #     outputs="text",
# #     examples=[
# #         [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
# #         [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
# #         [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
# #     ],
# #     title="Whisper Konkani",
# #     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# # )
# # iface.launch()
# from transformers import WhisperTokenizer, pipeline
# import gradio as gr
# import os

# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="marathi", task="transcribe")
# pipe = pipeline(model="thak123/gom-stt-v3", task="automatic-speech-recognition", tokenizer=tokenizer)

# def transcribe(audio):
#     result = pipe(audio)
#     text = result[0]['text']
#     print("op", text)
#     return text

# iface = gr.Interface(
#     fn=transcribe,
#     inputs=[gr.Audio(sources=["microphone", "upload"])],
#     outputs="text",
#     examples=[
#         [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
#         [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
#         [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
#     ],
#     title="Whisper Konkani",
#     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# )
# iface.launch()