hubsnippetai committed
Commit • 2036c34
1 Parent(s): 2238241
Update app.py

app.py CHANGED
@@ -1,6 +1,6 @@
 import torch
 # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from transformers import pipeline
+from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
 import gradio as gr
 import datetime
 
@@ -29,10 +29,24 @@ pipe = pipeline(
 """
 # call a text generation model to display the audio content after identifying the word(s) in the text output
 
-#import torch
-#from transformers import pipeline
-#from datasets import load_dataset
+# import torch
+# from transformers import pipeline
+# from datasets import load_dataset
 
+
+# from transformers import WhisperProcessor, WhisperForConditionalGeneration
+# from datasets import load_dataset
+
+# load model and processor
+processor = WhisperProcessor.from_pretrained("microsoft/whisper-base-webnn")
+model = WhisperForConditionalGeneration.from_pretrained("microsoft/whisper-base-webnn")
+model.config.forced_decoder_ids = None
+
+# load dummy dataset and read audio files
+# ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+# sample = ds[0]["audio"]
+
+"""
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 pipe = pipeline(
@@ -42,7 +56,7 @@ pipe = pipeline(
     chunk_length_s=30,
     device=device,
 )
-
+"""
 # ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 # sample = ds[0]["audio"]
 
@@ -52,9 +66,19 @@ pipe = pipeline(
 #prediction = pipe(sample.copy(), batch_size=8, return_timestamps=True)["chunks"]
 
 
-def audio2text(audio_file, prompt :
-
+def audio2text(audio_file, prompt : list):
+
+    input_features = processor(audio_file, sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+
+    # generate token ids
+    predicted_ids = model.generate(input_features)
+    # decode token ids to text
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
+
+    # transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+
+    # prediction = pipe(audio_file, batch_size=8, return_timestamps=True)["chunks"]
     #prediction=pipe(audio_file)
-    return
+    return transcription['text']
 
 gr.Interface(fn=audio2text, inputs=[gr.Audio(label='upload your audio file', sources='upload', type='filepath'), gr.Textbox(label="provide word(s) to search for")], outputs=[gr.Textbox(label="transcription")]).launch()
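
As committed, audio2text still has a few problems: sample is never defined (the dataset lines that created it are commented out), gr.Audio(type='filepath') hands the function a path string rather than a waveform, and processor.batch_decode returns a list of strings, so transcription['text'] would raise a TypeError. Below is a minimal runnable sketch of what the function appears to be aiming for, not the author's exact code. It assumes librosa is available to load and resample the upload to the 16 kHz Whisper expects, and it keeps the commit's "microsoft/whisper-base-webnn" checkpoint name (the widely used equivalent is "openai/whisper-base").

import librosa
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# checkpoint name kept from the commit; swap in "openai/whisper-base" if preferred
processor = WhisperProcessor.from_pretrained("microsoft/whisper-base-webnn")
model = WhisperForConditionalGeneration.from_pretrained("microsoft/whisper-base-webnn")
model.config.forced_decoder_ids = None


def audio2text(audio_file, prompt):
    # 'prompt' (the search words) is accepted but not used yet, mirroring the commit.
    # gr.Audio(type='filepath') passes a path string, not a waveform,
    # so load and resample the file to Whisper's 16 kHz.
    speech, sampling_rate = librosa.load(audio_file, sr=16000)
    input_features = processor(
        speech, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features
    # generate token ids, then decode them back to text
    predicted_ids = model.generate(input_features)
    # batch_decode returns a list of strings (one per input), so index [0]
    # rather than the committed transcription['text']
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]


gr.Interface(
    fn=audio2text,
    inputs=[
        gr.Audio(label='upload your audio file', sources=['upload'], type='filepath'),
        gr.Textbox(label="provide word(s) to search for"),
    ],
    outputs=[gr.Textbox(label="transcription")],
).launch()

The prompt textbox is wired through unchanged; the keyword search it hints at (the "call a text generation model ... after identifying the word(s)" note in the file's comments) is not implemented in this commit.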