File size: 2,421 Bytes
50c4728
 
 
 
 
 
83ebf54
50c4728
 
 
 
 
 
 
 
 
83ebf54
50c4728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83ebf54
dfa9b1d
 
 
 
 
 
50c4728
 
 
83ebf54
50c4728
dfa9b1d
50c4728
 
83ebf54
50c4728
 
 
 
dfa9b1d
50c4728
 
 
dfa9b1d
38e25e4
50c4728
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import gradio as gr
import time
from moviepy.editor import VideoFileClip
from faster_whisper import WhisperModel

# Function that converts a video to MP3
def convert_mp4_to_mp3(video_file_path, output_dir):
    """Extract the audio track of a video file and save it as an MP3.

    Args:
        video_file_path: Path to the source video file.
        output_dir: Directory in which the MP3 is written. The output file
            is named after the video's base name with a ``.mp3`` extension.

    Returns:
        The path of the MP3 file that was written.

    Raises:
        ValueError: If the video has no audio track to extract.
    """
    stem = os.path.splitext(os.path.basename(video_file_path))[0]
    output_path = os.path.join(output_dir, stem + ".mp3")

    video = VideoFileClip(video_file_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in: {video_file_path}")
        try:
            audio.write_audiofile(output_path)
        finally:
            audio.close()
    finally:
        # Always release the video reader, even when extraction fails.
        video.close()
    return output_path

# Function that transcribes an MP3 file to text using a Whisper model
def transcribe_audio(model_size, audio_file):
    """Transcribe an audio file with a Whisper model running on CPU (int8).

    Args:
        model_size: Model size name handed to ``WhisperModel``.
        audio_file: Path of the audio file to transcribe.

    Returns:
        A report string containing the detected language, one timestamped
        line per segment and the elapsed time. If transcription raises
        ``PermissionError`` or ``ValueError``, a string of the form
        ``"PermissionError: ..."`` / ``"ValueError: ..."`` is returned instead.
    """
    whisper = WhisperModel(model_size, device="cpu", compute_type="int8")
    started = time.time()

    try:
        segments, info = whisper.transcribe(audio_file, beam_size=5)
        language_line = (
            f"Detected language '{info.language!s}' "
            f"with probability {info.language_probability:f}"
        )
        # The walk over the segments stays inside the try so that errors
        # raised while iterating are reported the same way.
        transcript = "\n".join(
            f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text!s}"
            for seg in segments
        )
    except PermissionError as err:
        return f"PermissionError: {err}"
    except ValueError as err:
        return f"ValueError: {err}"

    took = time.time() - started
    return (
        f"{language_line}\n\n"
        f"Transcription:\n{transcript}\n\n"
        f"Elapsed time: {took:.2f} seconds"
    )

# Main function used by the Gradio interface
def process_video(model_size, video_file=None):
    """Convert an uploaded video to MP3 and return its transcription.

    Args:
        model_size: Whisper model size name, passed through to
            ``transcribe_audio``.
        video_file: The uploaded file: either an object exposing its path
            as ``.name`` or a plain path string.

    Returns:
        The text returned by ``transcribe_audio``, or a prompt message when
        the video or the model size is missing.
    """
    if not video_file:
        return "Please upload a video file."
    if not model_size:
        # The model dropdown has no default value, so the callback can be
        # invoked before a size has been picked.
        return "Please select a model size."

    import tempfile

    # Accept both file-like upload objects (path in ``.name``) and plain paths.
    video_file_path = getattr(video_file, "name", video_file)
    print(f"Using uploaded video file: {video_file_path}")

    # A per-call temporary directory avoids name collisions between uploads
    # with the same file name and removes the intermediate MP3 afterwards.
    with tempfile.TemporaryDirectory() as save_path:
        mp3_file_path = convert_mp4_to_mp3(video_file_path, save_path)
        print(f"Converted video to MP3: {mp3_file_path}")
        transcription = transcribe_audio(model_size, mp3_file_path)
    print("Transcription complete")
    return transcription

# Gradio interface definition: a model-size dropdown and a file upload feed
# process_video, whose returned string is shown as text output.
iface = gr.Interface(
    title="Video to Text Converter using Whisper",
    description="Upload a video file, select the Whisper model size, and get the transcribed text.",
    fn=process_video,
    inputs=[
        gr.Dropdown(
            choices=["tiny", "base", "small", "medium", "large"],
            label="Model Size",
        ),
        gr.File(label="Upload Video File"),
    ],
    outputs="text",
    live=True,
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()