File size: 3,304 Bytes
22b4650
 
d1baf01
 
22b4650
d66d670
22b4650
67ee310
d1baf01
 
22b4650
d1baf01
 
 
67ee310
22b4650
 
 
 
8a53bbd
22b4650
 
 
 
 
 
 
 
 
 
 
 
 
 
001993c
22b4650
001993c
22b4650
 
 
 
9e5c5ec
 
 
 
 
e08c550
9e5c5ec
 
 
 
 
 
8a53bbd
9e5c5ec
 
 
 
 
 
 
7355ad5
001993c
0d8f3bf
001993c
 
0d8f3bf
001993c
 
 
7355ad5
001993c
8a53bbd
 
22b4650
 
001993c
22b4650
001993c
22b4650
 
8a53bbd
 
22b4650
001993c
 
 
22b4650
0d7fd71
001993c
 
 
22b4650
0d8f3bf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from transformers import pipeline
import gradio as gr
import os
import subprocess
from pytube import YouTube
 
# Module-level ASR pipeline: Whisper-small fine-tuned for Cantonese (zh-HK).
# Loaded once at import time and shared by every handler below.
pipe = pipeline(model="tilos/whisper-small-zh-HK")  # change to "your-username/the-name-you-picked"

def video2mp3(video_file, output_ext="mp3"):
    """Extract the audio track of *video_file* into ``<basename>.<output_ext>``.

    Args:
        video_file: Path to the source video.
        output_ext: Extension (and implied container) of the output, default "mp3".

    Returns:
        The path of the converted audio file.
    """
    # Bug fix: the output path previously contained the literal text
    # "(unknown)" instead of the source basename, so every conversion
    # clobbered one fixed file and the returned path did not match the input.
    filename, _ext = os.path.splitext(video_file)
    output_path = f"{filename}.{output_ext}"
    # -y: overwrite without prompting; ffmpeg chatter is discarded.
    subprocess.call(["ffmpeg", "-y", "-i", video_file, output_path],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
    return output_path

def transcribe(audio):
    """Return the transcript text produced by the Whisper pipeline for *audio*.

    Args:
        audio: Path to an audio file accepted by the pipeline.
    """
    return pipe(audio)["text"]


def get_text(url):
    """Download a YouTube video's audio and return its transcript, stripped of
    surrounding whitespace.

    Args:
        url: A YouTube video URL.
    """
    audio_path = get_audio(url)
    transcript = pipe(audio_path)
    return transcript["text"].strip()

def get_audio(url):
    """Download the audio-only stream of a YouTube video into the current
    directory and return its path with an ``.mp3`` extension.

    NOTE(review): the file is merely renamed, not re-encoded — the bytes keep
    whatever container pytube delivered; the downstream pipeline appears to
    accept it regardless of extension.

    Args:
        url: A YouTube video URL.

    Returns:
        Path of the renamed download.
    """
    stream = YouTube(url).streams.filter(only_audio=True).first()
    downloaded = stream.download(output_path=".")
    root, _ext = os.path.splitext(downloaded)
    mp3_path = root + '.mp3'
    os.rename(downloaded, mp3_path)
    return mp3_path

def offline_video(video):
    """Transcribe an uploaded video file: convert it to mp3, then run ASR.

    Args:
        video: Path to the uploaded video file.
    """
    return transcribe(video2mp3(video))


# Gradio UI: four input modes (video file, audio file, microphone, YouTube URL),
# all feeding the same Whisper zh-HK pipeline.
with gr.Blocks() as demo:

    # Video file input — an Interface embedded inside the Blocks layout.
    gr.Interface(
            title="Whisper: Real Time Cantonese Recognition",
            description="Realtime demo for Cantonese speech recognition using a fine-tuned Whisper small model. "
                        "Generate zh-HK subtitle from video file, audio file, your microphone, and Youtube URL",
            fn=offline_video,
            inputs="video",
            outputs="text",
            allow_flagging="never",
        )

    # Audio file input.
    # Fix: this row previously named its button `micro_btn`, the same name the
    # microphone row reuses below — the wiring worked only because .click() ran
    # before the rebind. Each row now owns a distinct button name.
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(source="upload", type="filepath")
            audio_btn = gr.Button('Generate Voice Subtitles')
        with gr.Column():
            output_audio = gr.Textbox(placeholder='Transcript from audio', label='Subtitles')
            audio_btn.click(transcribe, inputs=input_audio, outputs=output_audio)

    # Microphone input (fix: local was misspelled `input_mircro`).
    with gr.Row():
        with gr.Column():
            input_micro = gr.Audio(source="microphone", type="filepath")
            micro_btn = gr.Button('Generate Voice Subtitles')
        with gr.Column():
            output_micro = gr.Textbox(placeholder='Transcript from mic', label='Subtitles')
            micro_btn.click(transcribe, inputs=input_micro, outputs=output_micro)

    # YouTube URL input.
    with gr.Row():
        with gr.Column():
            inputs_url = gr.Textbox(placeholder='Youtube URL', label='URL')
            url_btn = gr.Button('Generate Youtube Video Subtitles')
            examples = gr.Examples(examples=["https://www.youtube.com/watch?v=Yw4EoGWe0vw"], inputs=[inputs_url])
        with gr.Column():
            output_url = gr.Textbox(placeholder='Transcript from video.', label='Transcript')
            url_btn.click(get_text, inputs=inputs_url, outputs=output_url)


demo.launch(debug=True)