ChiBenevisamPas committed
Commit 4bc13cd · verified · 1 Parent(s): 9a2739a

Update app.py

Files changed (1)
  1. app.py +110 -59
app.py CHANGED
@@ -1,34 +1,54 @@
 import gradio as gr
 import whisper
 import os
-from transformers import MarianMTModel, MarianTokenizer
+from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 from docx import Document # For Word output
 from fpdf import FPDF # For PDF output
 from pptx import Presentation # For PowerPoint output
+import subprocess # To use ffmpeg for embedding subtitles
+import shlex # For better command-line argument handling
 
 # Load the Whisper model
-model = whisper.load_model("base") # Choose 'tiny', 'base', 'small', 'medium', or 'large'
+model = whisper.load_model("tiny") # Smaller model for faster transcription
 
-# Load MarianMT translation model for different languages
+# Load M2M100 translation model for different languages
 def load_translation_model(target_language):
-    lang_models = {
-        "fa": "Helsinki-NLP/opus-mt-en-fa", # English to Persian (Farsi)
-        "es": "Helsinki-NLP/opus-mt-en-es", # English to Spanish
-        "fr": "Helsinki-NLP/opus-mt-en-fr", # English to French
+    lang_codes = {
+        "fa": "fa", # Persian (Farsi)
+        "es": "es", # Spanish
+        "fr": "fr", # French
     }
-    model_name = lang_models.get(target_language)
-    if not model_name:
-        raise ValueError(f"Translation model for {target_language} not found")
-
-    tokenizer = MarianTokenizer.from_pretrained(model_name)
-    translation_model = MarianMTModel.from_pretrained(model_name)
+    target_lang_code = lang_codes.get(target_language)
+    if not target_lang_code:
+        raise ValueError(f"Translation model for {target_language} not supported")
+
+    # Load M2M100 model and tokenizer
+    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+    translation_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+
+    tokenizer.src_lang = "en"
+    tokenizer.tgt_lang = target_lang_code
+
     return tokenizer, translation_model
 
 def translate_text(text, tokenizer, model):
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
-    translated = model.generate(**inputs)
-    return tokenizer.decode(translated[0], skip_special_tokens=True)
+    try:
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+        translated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(tokenizer.tgt_lang))
+        return tokenizer.decode(translated[0], skip_special_tokens=True)
+    except Exception as e:
+        raise RuntimeError(f"Error during translation: {e}")
 
+# Helper function to format timestamps in SRT format (hh:mm:ss,ms)
+def format_timestamp(seconds):
+    milliseconds = int((seconds % 1) * 1000)
+    seconds = int(seconds)
+    hours = seconds // 3600
+    minutes = (seconds % 3600) // 60
+    seconds = seconds % 60
+    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
+
+# Corrected write_srt function
 def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
     with open(output_file, "w") as f:
         for i, segment in enumerate(transcription['segments']):
@@ -39,89 +59,120 @@ def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
             if translation_model:
                 text = translate_text(text, tokenizer, translation_model)
 
-            start_time = whisper.utils.format_timestamp(start)
-            end_time = whisper.utils.format_timestamp(end)
+            start_time = format_timestamp(start)
+            end_time = format_timestamp(end)
+
             f.write(f"{i + 1}\n")
             f.write(f"{start_time} --> {end_time}\n")
             f.write(f"{text.strip()}\n\n")
 
-def save_as_word(transcription, file_name, tokenizer=None, translation_model=None):
+def embed_hardsub_in_video(video_file, srt_file, output_video):
+    """Uses ffmpeg to burn subtitles into the video (hardsub)."""
+    command = f'ffmpeg -i "{video_file}" -vf "subtitles=\'{srt_file}\'" -c:v libx264 -crf 23 -preset medium "{output_video}"'
+
+    try:
+        print(f"Running command: {command}") # Debug statement
+        process = subprocess.run(shlex.split(command), capture_output=True, text=True, timeout=300)
+        print(f"ffmpeg output: {process.stdout}") # Debug statement
+        if process.returncode != 0:
+            raise RuntimeError(f"ffmpeg error: {process.stderr}") # Print the error
+    except subprocess.TimeoutExpired:
+        raise RuntimeError("ffmpeg process timed out.")
+    except Exception as e:
+        raise RuntimeError(f"Error running ffmpeg: {e}")
+
+def write_word(transcription, output_file, tokenizer=None, translation_model=None):
+    """Creates a Word document from the transcription."""
     doc = Document()
-    doc.add_heading('Video Subtitles', 0)
-    for segment in transcription['segments']:
+    for i, segment in enumerate(transcription['segments']):
+        start = segment['start']
+        end = segment['end']
         text = segment['text']
 
         if translation_model:
             text = translate_text(text, tokenizer, translation_model)
 
-        doc.add_paragraph(text.strip())
-
-    word_file = f"{file_name}.docx"
-    doc.save(word_file)
-    return word_file
+        doc.add_paragraph(f"{i + 1}. [{format_timestamp(start)} - {format_timestamp(end)}] {text.strip()}")
+    doc.save(output_file)
 
-def save_as_pdf(transcription, file_name, tokenizer=None, translation_model=None):
+def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
+    """Creates a PDF document from the transcription."""
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
     pdf.set_font("Arial", size=12)
-    pdf.cell(200, 10, txt="Video Subtitles", ln=True, align="C")
 
-    for segment in transcription['segments']:
+    for i, segment in enumerate(transcription['segments']):
+        start = segment['start']
+        end = segment['end']
         text = segment['text']
 
         if translation_model:
            text = translate_text(text, tokenizer, translation_model)
 
-        pdf.multi_cell(200, 10, txt=f"{text.strip()}\n")
+        pdf.multi_cell(0, 10, f"{i + 1}. [{format_timestamp(start)} - {format_timestamp(end)}] {text.strip()}")
 
-    pdf_file = f"{file_name}.pdf"
-    pdf.output(pdf_file)
-    return pdf_file
+    pdf.output(output_file)
 
-def save_as_powerpoint(transcription, file_name, tokenizer=None, translation_model=None):
-    prs = Presentation()
-    slide_layout = prs.slide_layouts[1] # Title and Content layout
+def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
+    """Creates a PowerPoint presentation from the transcription."""
+    ppt = Presentation()
 
-    for segment in transcription['segments']:
+    for i, segment in enumerate(transcription['segments']):
+        start = segment['start']
+        end = segment['end']
        text = segment['text']
 
         if translation_model:
             text = translate_text(text, tokenizer, translation_model)
 
-        slide = prs.slides.add_slide(slide_layout)
+        slide = ppt.slides.add_slide(ppt.slide_layouts[5]) # Blank slide
         title = slide.shapes.title
-        body = slide.shapes.placeholders[1]
-
-        title.text = "Subtitle"
-        body.text = text.strip()
+        title.text = f"{i + 1}. [{format_timestamp(start)} - {format_timestamp(end)}] {text.strip()}"
 
-    ppt_file = f"{file_name}.pptx"
-    prs.save(ppt_file)
-    return ppt_file
+    ppt.save(output_file)
 
 def transcribe_video(video_file, language, target_language, output_format):
+    # Transcribe the video with Whisper
     result = model.transcribe(video_file.name, language=language)
-
     video_name = os.path.splitext(video_file.name)[0]
 
     # Load the translation model for the selected subtitle language
     if target_language != "en":
-        tokenizer, translation_model = load_translation_model(target_language)
+        try:
+            tokenizer, translation_model = load_translation_model(target_language)
+        except Exception as e:
+            raise RuntimeError(f"Error loading translation model: {e}")
     else:
         tokenizer, translation_model = None, None
-
+
+    # Save the SRT file
+    srt_file = f"{video_name}.srt"
+    write_srt(result, srt_file, tokenizer, translation_model)
+
+    # Output based on user's selection
     if output_format == "SRT":
-        srt_file = f"{video_name}.srt"
-        write_srt(result, srt_file, tokenizer, translation_model)
         return srt_file
+    elif output_format == "Video with Hardsub":
+        output_video = f"{video_name}_with_subtitles.mp4"
+        try:
+            embed_hardsub_in_video(video_file.name, srt_file, output_video)
+            return output_video
+        except Exception as e:
+            raise RuntimeError(f"Error embedding subtitles in video: {e}")
     elif output_format == "Word":
-        return save_as_word(result, video_name, tokenizer, translation_model)
+        word_file = f"{video_name}.docx"
+        write_word(result, word_file, tokenizer, translation_model)
+        return word_file
     elif output_format == "PDF":
-        return save_as_pdf(result, video_name, tokenizer, translation_model)
+        pdf_file = f"{video_name}.pdf"
+        write_pdf(result, pdf_file, tokenizer, translation_model)
+        return pdf_file
     elif output_format == "PowerPoint":
-        return save_as_powerpoint(result, video_name, tokenizer, translation_model)
-
+        ppt_file = f"{video_name}.pptx"
+        write_ppt(result, ppt_file, tokenizer, translation_model)
+        return ppt_file
+
 # Gradio interface
 iface = gr.Interface(
     fn=transcribe_video,
@@ -129,12 +180,12 @@ iface = gr.Interface(
         gr.File(label="Upload Video"),
         gr.Dropdown(label="Select Video Language", choices=["en", "es", "fr", "de", "it", "pt"], value="en"),
         gr.Dropdown(label="Select Subtitle Language", choices=["en", "fa", "es", "fr"], value="fa"),
-        gr.Radio(label="Output Format", choices=["SRT", "Word", "PDF", "PowerPoint"], value="SRT") # Added output format selection
+        gr.Radio(label="Output Format", choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"], value="Video with Hardsub")
     ],
-    outputs=gr.File(label="Download Subtitles"),
-    title="Video Subtitle Generator with Translation",
-    description="Upload a video file to generate subtitles in various formats (SRT, Word, PDF, or PowerPoint) using Whisper and MarianMT for translation."
+    outputs=gr.File(label="Download Subtitles, Video, or Document"),
+    title="Video Subtitle Generator with Hardsub and Document Formats",
+    description="Upload a video file to generate subtitles in SRT format, download the video with hardsubbed subtitles, or generate Word, PDF, or PowerPoint documents using Whisper and M2M100 for translation."
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
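
A minimal usage sketch of the updated pipeline (not part of the commit): it assumes the functions above are importable from app.py, that ffmpeg is on the PATH, and that "sample.mp4" is a placeholder file name.

    from app import model, write_srt, embed_hardsub_in_video  # app.py loads the "tiny" Whisper model at import time

    result = model.transcribe("sample.mp4", language="en")    # "sample.mp4" is a hypothetical input clip
    write_srt(result, "sample.srt")                           # English subtitles, so no translation model is passed
    embed_hardsub_in_video("sample.mp4", "sample.srt", "sample_hardsub.mp4")  # burns the SRT into the video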