File size: 7,869 Bytes
bf8c5cb
 
 
 
 
 
 
44bb4ee
bf8c5cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44bb4ee
bf8c5cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44bb4ee
bf8c5cb
 
 
 
 
 
 
 
 
 
 
 
44bb4ee
bf8c5cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import datetime
import os
import subprocess

import gradio as gr
import whisper

#获取当前北京时间
utc_dt = datetime.datetime.utcnow()
beijing_dt = utc_dt.astimezone(datetime.timezone(datetime.timedelta(hours=10)))
formatted = beijing_dt.strftime("%Y-%m-%d_%H")
print(f"北京时间: {beijing_dt.year}{beijing_dt.month}{beijing_dt.day}日 "
      f"{beijing_dt.hour}{beijing_dt.minute}{beijing_dt.second}秒")
#创建作品存放目录
works_path = '../works_audio_video_transcribe/' + formatted
if not os.path.exists(works_path):
  os.makedirs(works_path)
print('作品目录:' + works_path)

#model_size = "small"
#model = whisper.load_model(model_size) #tiny、base、small、medium(可用)、large

def transcript(model_size, audiofile, prompt, output_dir):
    """Transcribe one audio file with the ``whisper`` command-line tool.

    The command is passed as an argument list (no shell), so file names or
    prompts containing spaces, quotes or shell metacharacters are handed to
    whisper verbatim instead of being re-parsed by a shell.

    Args:
        model_size: Whisper model name passed to ``--model``.
        audiofile: Path of the audio file to transcribe.
        prompt: Text passed to ``--initial_prompt``.
        output_dir: Directory passed to ``--output_dir``.

    Raises:
        FileNotFoundError: If the ``whisper`` executable is not on PATH.
        subprocess.CalledProcessError: If whisper exits with a non-zero
            status.
    """
    import subprocess  # local import keeps this block self-contained

    command = [
        "whisper", str(audiofile),
        "--model", str(model_size),
        "--language", "zh",
        "--initial_prompt", str(prompt),
        "--output_dir", str(output_dir),
    ]
    # check=True surfaces a failed transcription here rather than as a
    # confusing "file not found" when the caller opens the result files.
    subprocess.run(command, check=True)

def audio_recog(model_size, audiofile):
    """Transcribe an audio file and return the text and SRT results.

    Calls ``transcript`` with ``works_path`` as the output directory, then
    reads back the ``.txt`` and ``.srt`` files named after the input file's
    base name.

    Args:
        model_size: Whisper model name forwarded to ``transcript``.
        audiofile: Path of the audio file to transcribe.

    Returns:
        A tuple ``(text_output, text_file, srt_output, srt_file)``: the
        transcript text, the path of the ``.txt`` file, the SRT content and
        the path of the ``.srt`` file.

    Raises:
        ValueError: If no audio file path was supplied.
    """
    if not audiofile:
        # The UI passes None when the user clicks without uploading a file.
        raise ValueError("no audio file was provided")

    # Beijing time is UTC+8; use an aware "now" so the host's local timezone
    # cannot skew the logged timestamps.
    beijing_tz = datetime.timezone(datetime.timedelta(hours=8))

    started = datetime.datetime.now(beijing_tz)
    print(f"开始时间: {started.year}年{started.month}月{started.day}日 "
          f"{started.hour}时{started.minute}分{started.second}秒")

    print("音频文件:" + audiofile)

    prompt = "以下是普通话的句子"
    filename = os.path.splitext(os.path.basename(audiofile))[0]
    text_file = works_path + '/' + filename + '.txt'
    srt_file = works_path + '/' + filename + '.srt'

    transcript(model_size, audiofile, prompt, works_path)

    # Whisper writes its result files as UTF-8; read them the same way
    # instead of relying on the platform's default encoding.
    with open(text_file, "r", encoding="utf-8") as f:
        text_output = f.read()
        print("text:" + text_output)
    print("text文件:" + text_file)

    with open(srt_file, "r", encoding="utf-8") as f:
        srt_output = f.read()
        print("srt:" + srt_output)
    print("srt文件:" + srt_file)

    finished = datetime.datetime.now(beijing_tz)
    print(f"结束时间: {finished.year}年{finished.month}月{finished.day}日 "
          f"{finished.hour}时{finished.minute}分{finished.second}秒")

    return text_output, text_file, srt_output, srt_file

def video_recog(model_size, filepath):
    """Extract the audio track of a video with ffmpeg and transcribe it.

    The audio is written as an MP3 under ``works_path``, named after the
    current Beijing timestamp (microsecond resolution), and then passed to
    ``audio_recog``.

    Args:
        model_size: Whisper model name forwarded to ``audio_recog``.
        filepath: Path of the input video file.

    Returns:
        The tuple returned by ``audio_recog``:
        ``(text_output, text_file, srt_output, srt_file)``.

    Raises:
        ValueError: If no video file path was supplied.
        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status.
    """
    import subprocess  # local import keeps this block self-contained

    if not filepath:
        # The UI passes None when the user clicks without uploading a file.
        raise ValueError("no video file was provided")

    print("视频文件:" + filepath)

    # Beijing time is UTC+8; an aware "now" is independent of the host's
    # local timezone.
    beijing_tz = datetime.timezone(datetime.timedelta(hours=8))
    formatted = datetime.datetime.now(beijing_tz).strftime("%Y-%m-%d_%H-%M-%S.%f")

    # Extract the audio track as MP3. An argument list (no shell) keeps
    # paths with spaces or shell metacharacters intact.
    audiofile = works_path + '/' + formatted + '.mp3'
    subprocess.run(
        ["ffmpeg", "-i", filepath, "-vn",
         "-c:a", "libmp3lame", "-q:a", "4", audiofile],
        check=True,
    )

    # Recognize the extracted audio file.
    text_output, text_file, srt_output, srt_file = audio_recog(model_size, audiofile)

    # Muxing the subtitles back into the video is currently disabled:
    #     filename = os.path.splitext(os.path.basename(filepath))[0]
    #     worksfile = works_path + '/works_' + filename + '.mp4'
    #     ffmpeg -i {filepath} -i {srt_file} -c:s mov_text -c:v copy -c:a copy {worksfile}

    return text_output, text_file, srt_output, srt_file

# Custom CSS handed to gr.Blocks.
css_style = (
    "#fixed_size_img {height: 240px;} "
    "#overview {margin: auto;max-width: 400px; max-height: 400px;}"
)

# Page title and the introductory text rendered below the header.
title = "音视频转录 by宁侠"
description = (
    "您只需要上传一段音频或视频文件,我们的服务会快速对其进行语音识别,然后生成相应的文字和字幕。"
    "这样,您就可以轻松地记录下重要的语音内容,或者为视频添加精准的字幕。"
    "现在就来试试我们的音视频转录服务吧,让您的生活和工作更加便捷!"
)

# Location of the bundled example media.
examples_path = 'examples/'
examples = [[f'{examples_path}demo_shejipuhui.mp4']]

# Gradio interface: a page header followed by two tabs (audio / video).
# Components appear in the page in the order they are created, so the
# statement order below defines the layout.
with gr.Blocks(title=title, css=css_style) as demo:
    # Page header. Both wrapper <div> elements are closed explicitly so the
    # markup is well-formed.
    gr.HTML('''
      <div style="text-align: center; max-width: 720px; margin: 0 auto;">
                  <div
                    style="
                      display: inline-flex;
                      align-items: center;
                      gap: 0.8rem;
                      font-size: 1.75rem;
                    "
                  >
                    <h1 style="font-family:  PingFangSC; font-weight: 500; font-size: 36px; margin-bottom: 7px;">
                      音视频转录
                    </h1>
                    <h1 style="font-family: PingFangSC; font-weight: 500; line-height: 1.5em; font-size: 16px; margin-bottom: 7px;">
                      by宁侠
                    </h1>
                  </div>
      </div>
      ''')
    gr.Markdown(description)
    
    # --- Audio tab: inputs on the left, results on the right ---
    with gr.Tab("🔊音频转录 Audio Transcribe"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(label="🔊音频输入 Audio Input", type="filepath")
                gr.Examples(['examples/paddlespeech.asr-zh.wav', 'examples/demo_shejipuhui.mp3'], [audio_input])
                audio_model_size = gr.components.Radio(label="模型尺寸", choices=["tiny", "base", "small", "medium", "large"], value="small")
                audio_recog_button = gr.Button("👂音频识别 Recognize")
            with gr.Column():
                audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                audio_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
                audio_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
                # Hidden components; the subtitle handler below is commented out.
                audio_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
                audio_output = gr.Audio(label="🔊音频 Audio", visible=False)
    
    # audio_recog returns (text, text file path, srt text, srt file path),
    # matching the four outputs in order.
    audio_recog_button.click(audio_recog, inputs=[audio_model_size, audio_input], outputs=[audio_text_output, audio_text_file, audio_srt_output, audio_srt_file])
#    audio_subtitles_button.click(audio_subtitles, inputs=[audio_text_input], outputs=[audio_output])

    # --- Video tab: inputs on the left, results on the right ---
    with gr.Tab("🎥视频转录 Video Transcribe"):
        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="🎥视频输入 Video Input")
                gr.Examples(['examples/demo_shejipuhui.mp4'], [video_input], label='语音识别示例 ASR Demo')
                video_model_size = gr.components.Radio(label="模型尺寸", choices=["tiny", "base", "small", "medium", "large"], value="small")
                video_recog_button = gr.Button("👂视频识别 Recognize")
                video_output = gr.Video(label="🎥视频 Video", visible=False)
            with gr.Column():
                video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result", max_lines=5)
                video_text_file = gr.File(label="✏️识别结果文件 Recognition Result File")
                video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles", max_lines=10)
                video_srt_file = gr.File(label="📖SRT字幕文件 SRT File")
                # Hidden subtitle-styling controls; not wired to any handler here.
                with gr.Row(visible=False):
                    font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                    font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                video_subtitles_button = gr.Button("添加字幕\nGenerate Subtitles", visible=False)
                
    
    # video_recog returns the same four-tuple shape as audio_recog.
    video_recog_button.click(video_recog, inputs=[video_model_size, video_input], outputs=[video_text_output, video_text_file, video_srt_output, video_srt_file])
#    video_subtitles_button.click(video_subtitles, inputs=[video_text_input], outputs=[video_output])

# Start the Gradio service locally, with request queuing enabled.
demo.queue(api_open=False).launch(debug=True)