File size: 3,069 Bytes
e002d92
7cb0b8e
e002d92
41ec54b
7cb0b8e
e002d92
41ec54b
e002d92
41ec54b
 
9dc25d9
41ec54b
e002d92
41ec54b
 
 
 
 
e002d92
41ec54b
 
 
 
 
 
 
 
 
 
 
 
 
d4af723
41ec54b
 
6ddf7cf
e002d92
6ddf7cf
362c063
e002d92
 
9c9b591
 
6ddf7cf
9c9b591
cb035db
 
 
 
362c063
cb035db
 
 
 
 
41ec54b
1bf6da9
 
 
d4af723
9895fa7
41ec54b
e002d92
cb035db
41ec54b
e002d92
9895fa7
 
6ddf7cf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import functools
from pathlib import Path

import gradio as gr
import pandas as pd
import pysrt
from transformers import MarianMTModel, MarianTokenizer

# Download the language table and parse it as pipe-delimited text.
# NOTE(review): skiprows=2 presumably drops the Markdown header and separator
# rows of iso.md; the file itself is not visible here, so confirm.
url = "https://huggingface.co/Lenylvt/LanguageISO/resolve/main/iso.md"
df = (
    pd.read_csv(url, sep="|", skiprows=2, header=None)
    .dropna(axis=1, how='all')  # discard columns that contain no values at all
)
df.columns = ['ISO 639-1', 'ISO 639-2', 'Language Name', 'Native Name']
df['ISO 639-1'] = df['ISO 639-1'].str.strip()

# Dropdown choices: one (code, code) pair per row of the table, so the
# ISO 639-1 code is used for both elements of each pair.
language_options = [(code, f"{code}") for code in df['ISO 639-1']]

@functools.lru_cache(maxsize=4)
def _load_translation_model(model_name):
    """Load the Marian tokenizer and model for *model_name*, memoizing the pair.

    Without the cache every call to ``translate_text`` would run
    ``from_pretrained`` twice, i.e. once per subtitle. The cache is bounded
    so that a long-running process does not keep every model it ever loaded.
    Exceptions propagate to the caller and are not cached.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model


def translate_text(text, source_language_code, target_language_code):
    """Translate *text* with the Helsinki-NLP opus-mt model for the language pair.

    Args:
        text: Text to translate. Input is truncated to 512 tokens.
        source_language_code: Language code of *text*, used to build the model name.
        target_language_code: Language code to translate into.

    Returns:
        The translated text. If both codes are equal, or the model for the
        pair cannot be loaded, an explanatory message is returned instead of
        a translation (no exception is raised for these cases).
    """
    # Identical codes never need a model, so bail out before anything is loaded.
    if source_language_code == target_language_code:
        return "Translation between the same languages is not supported."

    model_name = f"Helsinki-NLP/opus-mt-{source_language_code}-{target_language_code}"

    # Any loading problem is reported as text, which is the contract existing
    # callers rely on.
    try:
        tokenizer, model = _load_translation_model(model_name)
    except Exception as e:
        return f"Failed to load model for {source_language_code} to {target_language_code}: {str(e)}"

    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    translated = model.generate(**encoded)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

def translate_srt(input_file, source_language_code, target_language_code, progress=gr.Progress()):
    """Translate every subtitle of an SRT file and save the result as a new file.

    Args:
        input_file: The uploaded SRT file, given either as a filesystem path
            or as an object that exposes the path through ``.name``.
        source_language_code: Language code of the subtitles.
        target_language_code: Language code to translate into.
        progress: Gradio progress tracker, updated after each subtitle.

    Returns:
        Path (str) of the translated file, written next to the input as
        ``<input stem>_<target_language_code>.srt``.

    Raises:
        gr.Error: If no file was supplied, or if source and target language
            are the same (otherwise every cue of the output would contain
            the "not supported" message instead of subtitle text).
    """
    if input_file is None:
        raise gr.Error("Please upload an SRT file.")
    if source_language_code == target_language_code:
        raise gr.Error("Translation between the same languages is not supported.")

    # Accept both a plain path and a file wrapper carrying the path in `.name`.
    source_path = Path(getattr(input_file, "name", input_file))
    subs = pysrt.open(str(source_path))
    total = len(subs)

    # NOTE(review): translate_text reports a model-loading failure by returning
    # a message string, which would be written as the cue text here.
    translated_subs = []
    for position, sub in enumerate(subs, start=1):
        translated_text = translate_text(sub.text, source_language_code, target_language_code)
        # Keep the original timing; renumber cues sequentially from 1.
        translated_subs.append(
            pysrt.SubRipItem(index=position, start=sub.start, end=sub.end, text=translated_text)
        )
        progress(position / total, desc=f"Translating subtitle {position}/{total}")

    # Derive the output name from the file stem only. A textual replace of
    # ".srt" would also rewrite directory names, and would leave the path
    # unchanged (overwriting the input) for names such as "movie.SRT".
    translated_path = source_path.with_name(f"{source_path.stem}_{target_language_code}.srt")
    pysrt.SubRipFile(translated_subs).save(str(translated_path))
    return str(translated_path)

# The two language selectors share the same choices and differ only in label.
source_language_dropdown, target_language_dropdown = (
    gr.Dropdown(choices=language_options, label=caption)
    for caption in ("Source Language", "Target Language")
)
file_input = gr.File(label="Upload SRT File")

# Expose translate_srt as a web form: an uploaded file plus the two language
# codes go in, and the path it returns is offered back as a file.
translated_output = gr.File(label="Translated SRT")
iface = gr.Interface(
    title="SRT Translator",
    description="Translate subtitles from one language to another.",
    fn=translate_srt,
    inputs=[file_input, source_language_dropdown, target_language_dropdown],
    outputs=translated_output,
)

# Start the app as soon as the module is executed.
iface.launch()