Spaces: Hemg/AUTOMATIC-SPEECH-RECOGNITION

Runtime error

App Files Community

Hemg committed on Apr 18, 2024

Commit

c394a78

verified ·

1 Parent(s): 5149cfb

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -91

app.py CHANGED Viewed

@@ -1,92 +1,8 @@
-# import torch
-# from transformers import pipeline
-# import gradio as gr
-# MODEL_NAME = "Hemg/ASRr"
-# BATCH_SIZE = 8
-# device = 0 if torch.cuda.is_available() else "cpu"
-# pipe = pipeline(
-#     task="automatic-speech-recognition",
-#     model=MODEL_NAME,
-#     chunk_length_s=30,
-#     device=device,
-#     return_timestamps='word'
-# )
-# # Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
-# def format_timestamp(
-#     seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
-# ):
-#     if seconds is not None:
-#         milliseconds = round(seconds * 1000.0)
-#         hours = milliseconds // 3_600_000
-#         milliseconds -= hours * 3_600_000
-#         minutes = milliseconds // 60_000
-#         milliseconds -= minutes * 60_000
-#         seconds = milliseconds // 1_000
-#         milliseconds -= seconds * 1_000
-#         hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
-#         return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
-#     else:
-#         # we have a malformed timestamp so just return it as is
-#         return seconds
-# def transcribe(file, return_timestamps):
-#     outputs = pipe(
-#         file,
-#         batch_size=BATCH_SIZE,
-#         return_timestamps=return_timestamps,
-#     )
-#     text = outputs["text"]
-#     if return_timestamps:
-#         timestamps = outputs["chunks"]
-#         timestamps = [
-#             f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
-#             for chunk in timestamps
-#         ]
-#         text = "\n".join(str(feature) for feature in timestamps)
-#     return text
-# demo = gr.Interface(
-#     fn=transcribe,
-#     inputs=[
-#         #gr.Audio(label="Audio", type="filepath"),
-#         gr.Audio(sources=["upload", "microphone"], type="filepath"),
-#         gr.Checkbox(label="Return timestamps"),
-#     ],
-#     outputs=gr.Textbox(show_copy_button=True, label="Text"),
-#     title="Automatic Speech Recognition",
-#     examples=[
-#         ["examples/example.wav", False],
-#         ["examples/example.wav", True],
-#     ],
-#     cache_examples=True,
-#     allow_flagging="never",
-# )
-# demo.launch()
 import torch
 from transformers import pipeline
 import gradio as gr
-MODEL_NAME = "JackismyShephard/whisper-tiny-finetuned-minds14"
 BATCH_SIZE = 8
 device = 0 if torch.cuda.is_available() else "cpu"
@@ -94,27 +10,64 @@ device = 0 if torch.cuda.is_available() else "cpu"
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     device=device,
 )
-def transcribe(file):
-    outputs = pipe(file, batch_size=BATCH_SIZE)
-    text = " ".join([output['transcription'] for output in outputs])
     return text
 demo = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.Audio(sources=["upload", "microphone"], type="filepath"),
     ],
     outputs=gr.Textbox(show_copy_button=True, label="Text"),
     title="Automatic Speech Recognition",
     examples=[
-        ["examples/example.wav"],
     ],
     cache_examples=True,
     allow_flagging="never",
@@ -122,3 +75,10 @@ demo = gr.Interface(
 demo.launch()

 import torch
 from transformers import pipeline
 import gradio as gr
+MODEL_NAME = "Hemg/ASRr"
 BATCH_SIZE = 8
 device = 0 if torch.cuda.is_available() else "cpu"
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
+    chunk_length_s=30,
     device=device,
+    return_timestamps='word'
 )
+# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
+def format_timestamp(
+    seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
+):
+    if seconds is not None:
+        milliseconds = round(seconds * 1000.0)
+        hours = milliseconds // 3_600_000
+        milliseconds -= hours * 3_600_000
+        minutes = milliseconds // 60_000
+        milliseconds -= minutes * 60_000
+        seconds = milliseconds // 1_000
+        milliseconds -= seconds * 1_000
+        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
+    else:
+        # we have a malformed timestamp so just return it as is
+        return seconds
+def transcribe(file, return_timestamps):
+    outputs = pipe(
+        file,
+        batch_size=BATCH_SIZE,
+        return_timestamps=return_timestamps,
+    )
+    text = outputs["text"]
+    if return_timestamps:
+        timestamps = outputs["chunks"]
+        timestamps = [
+            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
+            for chunk in timestamps
+        ]
+        text = "\n".join(str(feature) for feature in timestamps)
     return text
 demo = gr.Interface(
     fn=transcribe,
     inputs=[
+        #gr.Audio(label="Audio", type="filepath"),
         gr.Audio(sources=["upload", "microphone"], type="filepath"),
+        gr.Checkbox(label="Return timestamps"),
     ],
     outputs=gr.Textbox(show_copy_button=True, label="Text"),
     title="Automatic Speech Recognition",
     examples=[
+        ["examples/example.wav", False],
+        ["examples/example.wav", True],
     ],
     cache_examples=True,
     allow_flagging="never",
 demo.launch()