Spaces:

piecurus
/

speech_to_text

Runtime error

App Files Files Community

piecurus committed on Feb 2, 2022

Commit

da0005f

1 Parent(s): 540f7a6

added functionality for long text

Browse files

Files changed (1) hide show

app.py +163 -42

app.py CHANGED Viewed

@@ -1,66 +1,187 @@
-#References: 1. https://www.kdnuggets.com/2021/03/speech-text-wav2vec.html
-            #2. https://www.youtube.com/watch?v=4CoVcsxZphE
-            #3. https://www.analyticsvidhya.com/blog/2021/02/hugging-face-introduces-the-first-automatic-speech-recognition-model-wav2vec2/
 #Importing all the necessary packages
 import nltk
 import librosa
 import torch
 import gradio as gr
 from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
 nltk.download("punkt")
 #Loading the model and the tokenizer
 model_name = "facebook/wav2vec2-base-960h"
-tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
 def load_data(input_file):
-  """ Function for resampling to ensure that the speech input is sampled at 16KHz.
-  """
-  #read the file
-  speech, sample_rate = librosa.load(input_file)
-  #make it 1-D
-  if len(speech.shape) > 1:
-      speech = speech[:,0] + speech[:,1]
-  #Resampling at 16KHz since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 KHz.
-  if sample_rate !=16000:
-    speech = librosa.resample(speech, sample_rate,16000)
-  return speech
 def correct_casing(input_sentence):
-  """ This function is for correcting the casing of the generated transcribed text
-  """
-  sentences = nltk.sent_tokenize(input_sentence)
-  return (' '.join([s.replace(s[0],s[0].capitalize(),1) for s in sentences]))
 def asr_transcript(input_file):
-  """This function generates transcripts for the provided audio input
-  """
-  speech = load_data(input_file)
-  #Tokenize
-  input_values = tokenizer(speech, return_tensors="pt").input_values
-  #Take logits
-  logits = model(input_values).logits
-  #Take argmax
-  predicted_ids = torch.argmax(logits, dim=-1)
-  #Get the words from predicted word ids
-  transcription = tokenizer.decode(predicted_ids[0])
-  #Output is all upper case
-  transcription = correct_casing(transcription.lower())
-  return transcription
-gr.Interface(asr_transcript,
-             inputs = gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Please record your voice"),
              outputs = gr.outputs.Textbox(label="Output Text"),
              title="ASR using Wav2Vec 2.0",
              description = "This application displays transcribed text for given audio input",
              examples = [["Test_File1.wav"], ["Test_File2.wav"], ["Test_File3.wav"]], theme="grass").launch()

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
+# convert mp3 to wav
+# ffmpeg -i test_5.mp3 -ar 16000 test_5.wav
+# In[1]:
 #Importing all the necessary packages
 import nltk
 import librosa
+import IPython.display
 import torch
 import gradio as gr
 from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
 nltk.download("punkt")
+# In[2]:
 #Loading the model and the tokenizer
 model_name = "facebook/wav2vec2-base-960h"
+#model_name = "facebook/wav2vec2-large-xlsr-53"
+tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)#omdel_name
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
+# In[3]:
 def load_data(input_file):
+    """ Function for resampling to ensure that the speech input is sampled at 16KHz.
+    """
+    #read the file
+    speech, sample_rate = librosa.load(input_file)
+    #make it 1-D
+    if len(speech.shape) > 1:
+        speech = speech[:,0] + speech[:,1]
+    #Resampling at 16KHz since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 KHz.
+    if sample_rate !=16000:
+        speech = librosa.resample(speech, sample_rate,16000)
+    #speeches = librosa.effects.split(speech)
+    return speech
+# In[4]:
 def correct_casing(input_sentence):
+    """ This function is for correcting the casing of the generated transcribed text
+    """
+    sentences = nltk.sent_tokenize(input_sentence)
+    return (' '.join([s.replace(s[0],s[0].capitalize(),1) for s in sentences]))
+# In[5]:
 def asr_transcript(input_file):
+    """This function generates transcripts for the provided audio input
+    """
+    speech = load_data(input_file)
+    #Tokenize
+    input_values = tokenizer(speech, return_tensors="pt").input_values
+    #Take logits
+    logits = model(input_values).logits
+    #Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    #Get the words from predicted word ids
+    transcription = tokenizer.decode(predicted_ids[0])
+    #Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    return transcription
+# In[6]:
+def asr_transcript_long(input_file,tokenizer=tokenizer, model=model ):
+    transcript = ""
+    # Read the file's native sample rate; each chunk is resampled to 16k below if it differs
+    sample_rate = librosa.get_samplerate(input_file)
+    # Stream over 30-second chunks rather than loading the full file
+    stream = librosa.stream(
+        input_file,
+        block_length=30,
+        frame_length=sample_rate, #16000,
+        hop_length=sample_rate, #16000
+    )
+    for speech in stream:
+        if len(speech.shape) > 1:
+            speech = speech[:, 0] + speech[:, 1]
+        if sample_rate !=16000:
+            speech = librosa.resample(speech, sample_rate,16000)
+        input_values = tokenizer(speech, return_tensors="pt").input_values
+        logits = model(input_values).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = tokenizer.decode(predicted_ids[0])
+        #transcript += correct_sentence(transcription.lower())
+        transcript += correct_casing(transcription.lower())
+        transcript += " "
+    return transcript
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+from pydub.playback import play
+sound = AudioSegment.from_file("./test_2.wav", format="wav")
+chunks = split_on_silence(
+    sound,
+    # split on silences longer than 5000 ms (5 sec)
+    min_silence_len=5000,
+    # anything under -32 dBFS is considered silence
+    silence_thresh=-32,
+    # keep 500 ms of leading/trailing silence
+    keep_silence=500
+)#read the file
+speech, sample_rate = librosa.load('./test_2.wav')
+#make it 1-D
+if len(speech.shape) > 1:
+    speech = speech[:,0] + speech[:,1]
+#Resampling at 16KHz since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 KHz.
+if sample_rate !=16000:
+    speech = librosa.resample(speech, sample_rate,16000)
+part_of_speech = librosa.effects.split(speech)idx = -1
+IPython.display.Audio(data=speech[part_of_speech[idx,0]:part_of_speech[idx,1]], rate=16000)
+# In[ ]:
+gr.Interface(asr_transcript_long,
+             #inputs = gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Please record your voice"),
+             inputs = gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Upload your file here"),
              outputs = gr.outputs.Textbox(label="Output Text"),
              title="ASR using Wav2Vec 2.0",
              description = "This application displays transcribed text for given audio input",
              examples = [["Test_File1.wav"], ["Test_File2.wav"], ["Test_File3.wav"]], theme="grass").launch()
+# In[ ]:
+# In[ ]:
+# In[ ]:
+# In[7]:
+#temp = asr_transcript_long('./test_2.wav')
+# In[ ]:
+# In[ ]: