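"""Gradio Space that transcribes microphone audio with a fine-tuned Whisper model.

Records audio in the browser and returns the transcription produced by a
Hugging Face `pipeline` built on the `Shubham09/whisper31filescheck` checkpoint.
"""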
import librosa
import soundfile as sf
import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from huggingface_hub import HfApi, CommitOperationAdd
model_name = "Shubham09/whisper31filescheck"

# Processor and model are only needed for the commented-out manual decoding
# path near the end of this file; the app itself uses the pipeline below.
processor = WhisperProcessor.from_pretrained(model_name, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(model_name)
def load_data(input_file):
    # Read the audio file (librosa returns float32 samples)
    speech, sample_rate = librosa.load(input_file, mono=False)
    # Down-mix multi-channel audio to mono by averaging the channels
    if speech.ndim > 1:
        speech = speech.mean(axis=0)
    # Resample to the 16 kHz rate Whisper expects
    if sample_rate != 16000:
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
    return speech
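# Note: assuming librosa >= 0.8, the same result is available in one call:
#   speech, _ = librosa.load(input_file, sr=16000, mono=True)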
def write_to_file(input_file):
    # Save the waveform as 16 kHz FLAC
    fs = 16000
    sf.write("my_Audio_file.flac", input_file, fs)
    api = HfApi()
    operations = [
        # Stage the local file just written above for upload
        CommitOperationAdd(path_in_repo="my_Audio_file.flac",
                           path_or_fileobj="my_Audio_file.flac"),
    ]
    # Push the staged file to the model repo; assumes a token with write
    # access to the repo is available in the environment
    api.create_commit(repo_id=model_name,
                      operations=operations,
                      commit_message="Upload recorded audio")
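# Hypothetical wiring (not part of the original app): archive each recording
# on the Hub before transcribing it, using the helpers defined above.
#
#   def transcribe_and_archive(audio):
#       write_to_file(load_data(audio))
#       return pipe(audio)["text"]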
# Speech-recognition pipeline built on the fine-tuned checkpoint above
pipe = pipeline("automatic-speech-recognition", model=model_name)

def asr_transcript(audio):
    # Gradio passes the recording as a local file path; the pipeline
    # handles loading and resampling internally
    if audio is None:
        return ""
    return pipe(audio)["text"]
# Alternative (unused): manual decoding with the processor/model loaded above,
# kept from an earlier version of this Space.
# def asr_transcript_manual(input_file):
#     speech = load_data(input_file)
#     input_features = processor(speech, sampling_rate=16000,
#                                return_tensors="pt").input_features
#     predicted_ids = model.generate(input_features)
#     return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
# Gradio 4.x API: `sources` replaces the older `source`/`optional` arguments,
# and the deprecated string theme "grass" is dropped
gr.Interface(asr_transcript,
             inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speaker"),
             outputs=gr.Textbox(label="Output Text"),
             title="ASR using Whisper",
             description="This application displays the transcribed text for a given audio input",
             examples=[["Actuator.wav"], ["anomalies.wav"]]).launch()