 
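"""Gradio ASR demo (Hugging Face Spaces app.py) for the fine-tuned Whisper
checkpoint "Shubham09/whisper31filescheck": it transcribes microphone or
uploaded audio and displays the resulting text."""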
import librosa
import gradio as gr
import soundfile as sf
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from huggingface_hub import HfApi, CommitOperationAdd

model_name = "Shubham09/whisper31filescheck"

# Processor and model back the manual transcription path further below;
# the Gradio app itself uses the higher-level pipeline.
processor = WhisperProcessor.from_pretrained(model_name, task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(model_name)

def load_data(input_file):
    # Read the file (librosa returns float32 audio and its sample rate)
    speech, sample_rate = librosa.load(input_file)
    # Down-mix to one channel if the audio is stereo (averaging avoids clipping)
    if len(speech.shape) > 1:
        speech = speech.mean(axis=1)
    # Whisper expects 16 kHz input, so resample if needed
    if sample_rate != 16000:
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
    return speech
    
def write_to_file(input_file):
    # Persist the 16 kHz audio locally so it can be committed to the repo.
    fs = 16000
    sf.write("my_Audio_file.flac", input_file, fs)
    api = HfApi()
    operations = [
        CommitOperationAdd(path_in_repo="my_Audio_file.flac",
                           path_or_fileobj="my_Audio_file.flac"),
    ]
    # The original app builds `operations` but never pushes them; actually
    # committing the file would require a write token, e.g.:
    # api.create_commit(repo_id=model_name, operations=operations,
    #                   commit_message="Add recorded audio")

pipe = pipeline("automatic-speech-recognition", model=model_name)  # change to "your-username/the-name-you-picked"

def asr_transcript(audio):
    # `audio` is the filepath Gradio hands us (microphone recording or upload)
    if audio is None:
        return "No audio provided."
    text = pipe(audio)["text"]
    return text

def asr_transcript_manual(input_file):
    # Alternative to the pipeline above: run the processor and model directly.
    speech = load_data(input_file)
    # Tokenize: convert the waveform into log-mel input features
    input_features = processor(speech, sampling_rate=16000,
                               return_tensors="pt").input_features
    # Whisper is a seq2seq model, so decode with generate() rather than
    # taking an argmax over logits
    predicted_ids = model.generate(input_features)
    # Map the predicted token ids back to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

gr.Interface(
    fn=asr_transcript,
    inputs=gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker"),
    outputs=gr.outputs.Textbox(label="Output Text"),
    title="ASR using Whisper",
    description="This application displays the transcribed text for the given audio input.",
    examples=[["Actuator.wav"], ["anomalies.wav"]],
    theme="grass",
).launch()
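# Note: the example clips "Actuator.wav" and "anomalies.wav" are assumed to sit
# in the Space's root directory. On Hugging Face Spaces this script runs
# automatically as app.py; locally, start it with `python app.py`.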