File size: 4,306 Bytes
9e156fa
4f0841f
 
9e156fa
4f0841f
 
 
9e156fa
4f0841f
9e156fa
 
4f0841f
 
 
 
 
 
 
 
 
9e156fa
4f0841f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
745d185
 
4f0841f
1aee38e
4f0841f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e156fa
4f0841f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e156fa
4f0841f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import logging
import os
import tempfile

import ffmpeg  # ffmpeg-python binding; requires the ffmpeg binary on PATH
import gradio as gr
import pandas as pd
import pysrt
import requests
from faster_whisper import WhisperModel
from moviepy.editor import VideoFileClip
from transformers import MarianMTModel, MarianTokenizer

# Configure logging for debugging purposes
logging.basicConfig()
logging.getLogger("faster_whisper").setLevel(logging.DEBUG)

# Fetch and parse the ISO-639 language table (a markdown table) from the Hub.
url = "https://huggingface.co/Lenylvt/LanguageISO/resolve/main/iso.md"
df = pd.read_csv(url, delimiter="|", skiprows=2, header=None).dropna(axis=1, how='all')
df.columns = ['ISO 639-1', 'ISO 639-2', 'Language Name', 'Native Name']
# Markdown table cells are space-padded; strip every column (not just the
# ISO code) so codes are usable as-is and labels render without stray spaces.
for _col in df.columns:
    df[_col] = df[_col].str.strip()

# Dropdown choices as (display label, value): Gradio hands the SECOND tuple
# element to the callback, and the callback needs the bare ISO 639-1 code to
# build "Helsinki-NLP/opus-mt-{src}-{tgt}" model names. The original
# (code, label) order would have passed the long label as the value.
language_options = [
    (f"{row['Language Name']} ({row['ISO 639-1']})", row['ISO 639-1'])
    for _, row in df.iterrows()
]

def format_timestamp(seconds):
    """Render a duration in seconds as an ``HH:MM:SS.mmm`` timestamp string."""
    whole = int(seconds)
    hh, mm = divmod(whole // 60, 60)
    # Keep the fractional part attached to the seconds field so the
    # milliseconds survive the integer split above.
    frac_seconds = seconds - (hh * 3600 + mm * 60)
    return f"{hh:02d}:{mm:02d}:{frac_seconds:06.3f}"

def extract_audio(video_path):
    """Extract a mono 16 kHz PCM WAV track from *video_path*.

    Returns the path of a freshly created temporary file. A unique name is
    generated per call so concurrent requests do not clobber each other's
    audio (the original fixed '/tmp/audio.wav' raced between users).
    """
    fd, output_audio_path = tempfile.mkstemp(suffix='.wav')
    os.close(fd)  # only the path is needed; ffmpeg writes the file itself
    # overwrite_output is required: mkstemp already created the (empty)
    # target and ffmpeg refuses to overwrite an existing file otherwise.
    (
        ffmpeg
        .input(video_path)
        .output(output_audio_path, acodec='pcm_s16le', ac=1, ar='16k')
        .run(overwrite_output=True, quiet=True)
    )
    return output_audio_path

def transcribe_and_optionally_translate(video_file, source_language, target_language, model_size, allow_modification):
    """Transcribe *video_file*'s audio and return SRT-formatted text.

    The caller (`add_hard_subtitle_to_video`) writes the returned string to a
    .srt file for ffmpeg's subtitle filter, so this must emit real SRT cues
    with timestamps — the original flattened all segments into one plain
    space-joined string, which is not valid SRT.

    Returns a `(srt_text, allow_modification)` tuple; `allow_modification`
    is passed through unchanged for the UI flow.
    """
    audio_file = extract_audio(video_file)

    # Transcription (CPU/int8 keeps this runnable without a GPU; use
    # device="cuda" with float16/int8 when a GPU is available).
    device = "cpu"
    compute_type = "int8"
    model = WhisperModel(model_size, device=device, compute_type=compute_type)
    segments, _ = model.transcribe(audio_file)

    # Load the translation model ONCE, outside the per-segment loop.
    tokenizer = None
    translator = None
    if source_language != target_language:
        model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        translator = MarianMTModel.from_pretrained(model_name)

    def _srt_time(seconds):
        """Format seconds as an SRT timestamp: HH:MM:SS,mmm (comma separator)."""
        ms = int(round(seconds * 1000))
        h, rem = divmod(ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    # Build numbered SRT cues, translating each segment's text individually
    # so timestamps are never fed through the translation model.
    cues = []
    for index, segment in enumerate(segments, start=1):
        text = segment.text.strip()
        if translator is not None and text:
            batch = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            text = tokenizer.decode(translator.generate(**batch)[0], skip_special_tokens=True)
        cues.append(f"{index}\n{_srt_time(segment.start)} --> {_srt_time(segment.end)}\n{text}\n")

    return "\n".join(cues), allow_modification

def add_hard_subtitle_to_video(input_video, transcript):
    """Burn *transcript* (SRT-formatted text) into *input_video*.

    Returns the path of the rendered MP4. Uniquely named temporary files are
    used for both the subtitle and the output video, so concurrent requests
    do not overwrite each other (the original fixed /tmp paths raced, and a
    second run failed because ffmpeg refuses to overwrite existing output).
    """
    fd, temp_subtitle_path = tempfile.mkstemp(suffix='.srt')
    with os.fdopen(fd, 'w', encoding='utf-8') as file:
        file.write(transcript)  # transcript must already be SRT-formatted

    fd, output_video_path = tempfile.mkstemp(suffix='.mp4')
    os.close(fd)  # ffmpeg writes the file; we only need the reserved path
    # overwrite_output: mkstemp already created the (empty) target file.
    ffmpeg.input(input_video).output(
        output_video_path, vf=f"subtitles={temp_subtitle_path}"
    ).run(overwrite_output=True, quiet=True)

    return output_video_path

# Gradio Interface
def process_video(video, source_language, target_language, model_size='base', allow_modification=False, modified_transcript=None):
    """Full pipeline: transcribe (and optionally translate) *video*, then
    render it with hard subtitles.

    When the user enabled editing and supplied a replacement transcript,
    that text is burned in instead of the automatic one.
    """
    transcript, editable = transcribe_and_optionally_translate(
        video, source_language, target_language, model_size, allow_modification
    )

    if editable and modified_transcript:
        transcript = modified_transcript

    return add_hard_subtitle_to_video(video, transcript)

# Setup the Gradio app
# Wire the pipeline into a simple Gradio form: inputs map positionally onto
# process_video's parameters; the single output is the subtitled video.
app = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Dropdown(choices=language_options, label="Source Language"),
        gr.Dropdown(choices=language_options, label="Target Language"),
        # faster-whisper checkpoint names, roughly increasing accuracy/cost.
        gr.Dropdown(choices=["base", "small", "medium", "large", "large-v2", "large-v3"], label="Model Size"),
        gr.Checkbox(label="Allow Transcript Modification?", value=False),
        # Only used when the checkbox above is ticked; see process_video.
        gr.TextArea(label="Modified Transcript (if allowed)")
    ],
    outputs=gr.Video(label="Processed Video with Hard Subtitles"),
    title="Video Transcription and Translation Tool",
    description="Transcribe or translate your video content. Optionally, edit the transcription before adding hard subtitles."
)

# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    app.launch()