whisper-large-v3

Sangmin committed on Nov 8, 2023

Commit

c5012b6

•

1 Parent(s): cb5005d

Add an option to choose language

Let users specify the language of the audio/video content.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -83,7 +83,7 @@ def download_yt_audio(yt_url, filename):
             raise gr.Error(str(err))
-def yt_transcribe(yt_url, task, return_timestamps, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
     with tempfile.TemporaryDirectory() as tmpdirname:
@@ -94,8 +94,12 @@ def yt_transcribe(yt_url, task, return_timestamps, max_filesize=75.0):
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
     if return_timestamps:
         return html_embed_str, chunks_to_srt(result['chunks'])
@@ -111,6 +115,7 @@ mf_transcribe = gr.Interface(
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
         gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
         gr.inputs.Checkbox(label="Return timestamps"),
     ],
     outputs="text",
     layout="horizontal",

             raise gr.Error(str(err))
+def yt_transcribe(yt_url, task, return_timestamps, language, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
     with tempfile.TemporaryDirectory() as tmpdirname:
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    # Map the language names to their corresponding codes
+    language_codes = {"English": "en", "Korean": "ko", "Japanese": "ja"}
+    language_code = language_codes.get(language, "en")  # Default to "en" if the language is not found
+    result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task, "language": f"<|{language_code}|>"}, return_timestamps=return_timestamps)
     if return_timestamps:
         return html_embed_str, chunks_to_srt(result['chunks'])
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
         gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
         gr.inputs.Checkbox(label="Return timestamps"),
+        gr.inputs.Dropdown(choices=["English", "Korean", "Japanese"], label="Language"),
     ],
     outputs="text",
     layout="horizontal",