ChiBenevisamPas committed
Commit 4bc13cd · verified · 1 Parent(s): 9a2739a

Update app.py

Files changed (1)
  1. app.py +110 -59
app.py CHANGED
@@ -1,34 +1,54 @@
 import gradio as gr
 import whisper
 import os
-from transformers import MarianMTModel, MarianTokenizer
+from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 from docx import Document # For Word output
 from fpdf import FPDF # For PDF output
 from pptx import Presentation # For PowerPoint output
+import subprocess # To use ffmpeg for embedding subtitles
+import shlex # For better command-line argument handling
 
 # Load the Whisper model
-model = whisper.load_model("base") # Choose 'tiny', 'base', 'small', 'medium', or 'large'
+model = whisper.load_model("tiny") # Smaller model for faster transcription
 
-# Load MarianMT translation model for different languages
+# Load M2M100 translation model for different languages
 def load_translation_model(target_language):
-    lang_models = {
-        "fa": "Helsinki-NLP/opus-mt-en-fa", # English to Persian (Farsi)
-        "es": "Helsinki-NLP/opus-mt-en-es", # English to Spanish
-        "fr": "Helsinki-NLP/opus-mt-en-fr", # English to French
+    lang_codes = {
+        "fa": "fa", # Persian (Farsi)
+        "es": "es", # Spanish
+        "fr": "fr", # French
     }
-    model_name = lang_models.get(target_language)
-    if not model_name:
-        raise ValueError(f"Translation model for {target_language} not found")
-
-    tokenizer = MarianTokenizer.from_pretrained(model_name)
-    translation_model = MarianMTModel.from_pretrained(model_name)
+    target_lang_code = lang_codes.get(target_language)
+    if not target_lang_code:
+        raise ValueError(f"Translation model for {target_language} not supported")
+
+    # Load M2M100 model and tokenizer
+    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+    translation_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+
+    tokenizer.src_lang = "en"
+    tokenizer.tgt_lang = target_lang_code
+
     return tokenizer, translation_model
 
 def translate_text(text, tokenizer, model):
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
-    translated = model.generate(**inputs)
-    return tokenizer.decode(translated[0], skip_special_tokens=True)
+    try:
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+        translated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id(tokenizer.tgt_lang))
+        return tokenizer.decode(translated[0], skip_special_tokens=True)
+    except Exception as e:
+        raise RuntimeError(f"Error during translation: {e}")
 
+# Helper function to format timestamps in SRT format (hh:mm:ss,ms)
+def format_timestamp(seconds):
+    milliseconds = int((seconds % 1) * 1000)
+    seconds = int(seconds)
+    hours = seconds // 3600
+    minutes = (seconds % 3600) // 60
+    seconds = seconds % 60
+    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
+
+# Corrected write_srt function
 def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
     with open(output_file, "w") as f:
         for i, segment in enumerate(transcription['segments']):
@@ -39,89 +59,120 @@ def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
             if translation_model:
                 text = translate_text(text, tokenizer, translation_model)
 
-            start_time = whisper.utils.format_timestamp(start)
-            end_time = whisper.utils.format_timestamp(end)
+            start_time = format_timestamp(start)
+            end_time = format_timestamp(end)
+
             f.write(f"{i + 1}\n")
             f.write(f"{start_time} --> {end_time}\n")
             f.write(f"{text.strip()}\n\n")
 
-def save_as_word(transcription, file_name, tokenizer=None, translation_model=None):
+def embed_hardsub_in_video(video_file, srt_file, output_video):
+    """Uses ffmpeg to burn subtitles into the video (hardsub)."""
+    command = f'ffmpeg -i "{video_file}" -vf "subtitles=\'{srt_file}\'" -c:v libx264 -crf 23 -preset medium "{output_video}"'
+
+    try:
+        print(f"Running command: {command}") # Debug statement
+        process = subprocess.run(shlex.split(command), capture_output=True, text=True, timeout=300)
+        print(f"ffmpeg output: {process.stdout}") # Debug statement
+        if process.returncode != 0:
+            raise RuntimeError(f"ffmpeg error: {process.stderr}") # Print the error
+    except subprocess.TimeoutExpired:
+        raise RuntimeError("ffmpeg process timed out.")
+    except Exception as e:
+        raise RuntimeError(f"Error running ffmpeg: {e}")
+
+def write_word(transcription, output_file, tokenizer=None, translation_model=None):
+    """Creates a Word document from the transcription."""
     doc = Document()
-    doc.add_heading('Video Subtitles', 0)
-    for segment in transcription['segments']:
+    for i, segment in enumerate(transcription['segments']):
+        start = segment['start']
+        end = segment['end']
         text = segment['text']
 
         if translation_model:
             text = translate_text(text, tokenizer, translation_model)
 
-        doc.add_paragraph(text.strip())
-
-    word_file = f"{file_name}.docx"
-    doc.save(word_file)
-    return word_file
+        doc.add_paragraph(f"{i + 1}. [{format_timestamp(start)} - {format_timestamp(end)}] {text.strip()}")
+    doc.save(output_file)
 
-def save_as_pdf(transcription, file_name, tokenizer=None, translation_model=None):
+def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
+    """Creates a PDF document from the transcription."""
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
     pdf.set_font("Arial", size=12)
-    pdf.cell(200, 10, txt="Video Subtitles", ln=True, align="C")
 
-    for segment in transcription['segments']:
+    for i, segment in enumerate(transcription['segments']):
+        start = segment['start']
+        end = segment['end']
         text = segment['text']
 
         if translation_model:
            text = translate_text(text, tokenizer, translation_model)
 
-        pdf.multi_cell(200, 10, txt=f"{text.strip()}\n")
+        pdf.multi_cell(0, 10, f"{i + 1}. [{format_timestamp(start)} - {format_timestamp(end)}] {text.strip()}")
 
-    pdf_file = f"{file_name}.pdf"
-    pdf.output(pdf_file)
-    return pdf_file
+    pdf.output(output_file)
 
-def save_as_powerpoint(transcription, file_name, tokenizer=None, translation_model=None):
-    prs = Presentation()
-    slide_layout = prs.slide_layouts[1] # Title and Content layout
+def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
+    """Creates a PowerPoint presentation from the transcription."""
+    ppt = Presentation()
 
-    for segment in transcription['segments']:
+    for i, segment in enumerate(transcription['segments']):
+        start = segment['start']
+        end = segment['end']
        text = segment['text']
 
         if translation_model:
             text = translate_text(text, tokenizer, translation_model)
 
-        slide = prs.slides.add_slide(slide_layout)
+        slide = ppt.slides.add_slide(ppt.slide_layouts[5]) # Blank slide
         title = slide.shapes.title
-        body = slide.shapes.placeholders[1]
-
-        title.text = "Subtitle"
-        body.text = text.strip()
+        title.text = f"{i + 1}. [{format_timestamp(start)} - {format_timestamp(end)}] {text.strip()}"
 
-    ppt_file = f"{file_name}.pptx"
-    prs.save(ppt_file)
-    return ppt_file
+    ppt.save(output_file)
 
 def transcribe_video(video_file, language, target_language, output_format):
+    # Transcribe the video with Whisper
     result = model.transcribe(video_file.name, language=language)
-
     video_name = os.path.splitext(video_file.name)[0]
 
     # Load the translation model for the selected subtitle language
     if target_language != "en":
-        tokenizer, translation_model = load_translation_model(target_language)
+        try:
+            tokenizer, translation_model = load_translation_model(target_language)
+        except Exception as e:
+            raise RuntimeError(f"Error loading translation model: {e}")
     else:
         tokenizer, translation_model = None, None
-
+
+    # Save the SRT file
+    srt_file = f"{video_name}.srt"
+    write_srt(result, srt_file, tokenizer, translation_model)
+
+    # Output based on user's selection
     if output_format == "SRT":
-        srt_file = f"{video_name}.srt"
-        write_srt(result, srt_file, tokenizer, translation_model)
         return srt_file
+    elif output_format == "Video with Hardsub":
+        output_video = f"{video_name}_with_subtitles.mp4"
+        try:
+            embed_hardsub_in_video(video_file.name, srt_file, output_video)
+            return output_video
+        except Exception as e:
+            raise RuntimeError(f"Error embedding subtitles in video: {e}")
     elif output_format == "Word":
-        return save_as_word(result, video_name, tokenizer, translation_model)
+        word_file = f"{video_name}.docx"
+        write_word(result, word_file, tokenizer, translation_model)
+        return word_file
     elif output_format == "PDF":
-        return save_as_pdf(result, video_name, tokenizer, translation_model)
+        pdf_file = f"{video_name}.pdf"
+        write_pdf(result, pdf_file, tokenizer, translation_model)
+        return pdf_file
     elif output_format == "PowerPoint":
-        return save_as_powerpoint(result, video_name, tokenizer, translation_model)
-
+        ppt_file = f"{video_name}.pptx"
+        write_ppt(result, ppt_file, tokenizer, translation_model)
+        return ppt_file
+
 # Gradio interface
 iface = gr.Interface(
     fn=transcribe_video,
@@ -129,12 +180,12 @@ iface = gr.Interface(
         gr.File(label="Upload Video"),
         gr.Dropdown(label="Select Video Language", choices=["en", "es", "fr", "de", "it", "pt"], value="en"),
         gr.Dropdown(label="Select Subtitle Language", choices=["en", "fa", "es", "fr"], value="fa"),
-        gr.Radio(label="Output Format", choices=["SRT", "Word", "PDF", "PowerPoint"], value="SRT") # Added output format selection
+        gr.Radio(label="Output Format", choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"], value="Video with Hardsub")
     ],
-    outputs=gr.File(label="Download Subtitles"),
-    title="Video Subtitle Generator with Translation",
-    description="Upload a video file to generate subtitles in various formats (SRT, Word, PDF, or PowerPoint) using Whisper and MarianMT for translation."
+    outputs=gr.File(label="Download Subtitles, Video, or Document"),
+    title="Video Subtitle Generator with Hardsub and Document Formats",
+    description="Upload a video file to generate subtitles in SRT format, download the video with hardsubbed subtitles, or generate Word, PDF, or PowerPoint documents using Whisper and M2M100 for translation."
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
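
A minimal usage sketch of the updated pipeline (not part of the commit): it assumes the functions above are importable from app.py, that ffmpeg is on the PATH, and that "sample.mp4" is a placeholder file name.

    from app import model, write_srt, embed_hardsub_in_video  # app.py loads the "tiny" Whisper model at import time

    result = model.transcribe("sample.mp4", language="en")    # "sample.mp4" is a hypothetical input clip
    write_srt(result, "sample.srt")                           # English subtitles, so no translation model is passed
    embed_hardsub_in_video("sample.mp4", "sample.srt", "sample_hardsub.mp4")  # burns the SRT into the video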