caspr

Paused

App Files Files Community

artificialguybr committed on Jan 28, 2024

Commit

a1bbd1f

verified ·

1 Parent(s): b6ac968

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -23

app.py CHANGED Viewed

@@ -16,6 +16,7 @@ import scipy
 from googletrans import Translator
 import re
 import subprocess
 ZipFile("ffmpeg.zip").extractall()
 st = os.stat('ffmpeg')
@@ -25,9 +26,6 @@ with open('google_lang_codes.json', 'r') as f:
     google_lang_codes = json.load(f)
 translator = Translator()
-#tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")
-#model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
 whisper_model = WhisperModel("large-v2", device="cuda", compute_type="float16")
 print("cwd", os.getcwd())
@@ -42,37 +40,39 @@ def process_video(Video, target_language):
     run(["ffmpeg", "-version"])
     audio_file = f"{common_uuid}.wav"
     run(["ffmpeg", "-i", Video, audio_file])
     # Transcription with Whisper.
-    print("Iniciando transcrição com Whisper")
-    segments, _ = whisper_model.transcribe(audio_file, beam_size=5)
-    segments = list(segments)
-    transcript_file = f"{current_path}/{common_uuid}.srt"
     # Create a list to hold the translated lines.
     translated_lines = []
     with open(transcript_file, "w+", encoding="utf-8") as f:
         counter = 1
-        for segment in segments:
-            start_hours = int(segment.start // 3600)
-            start_minutes = int((segment.start % 3600) // 60)
-            start_seconds = int(segment.start % 60)
-            start_milliseconds = int((segment.start - int(segment.start)) * 1000)
-            end_hours = int(segment.end // 3600)
-            end_minutes = int((segment.end % 3600) // 60)
-            end_seconds = int(segment.end % 60)
-            end_milliseconds = int((segment.end - int(segment.end)) * 1000)
-            formatted_start = f"{start_hours:02d}:{start_minutes:02d}:{start_seconds:02d},{start_milliseconds:03d}"
-            formatted_end = f"{end_hours:02d}:{end_minutes:02d}:{end_seconds:02d},{end_milliseconds:03d}"
             f.write(f"{counter}\n")
             f.write(f"{formatted_start} --> {formatted_end}\n")
-            f.write(f"{segment.text}\n\n")
             counter += 1
         # Move the file pointer to the beginning of the file.
         f.seek(0)

 from googletrans import Translator
 import re
 import subprocess
+import datetime
 ZipFile("ffmpeg.zip").extractall()
 st = os.stat('ffmpeg')
     google_lang_codes = json.load(f)
 translator = Translator()
 whisper_model = WhisperModel("large-v2", device="cuda", compute_type="float16")
 print("cwd", os.getcwd())
     run(["ffmpeg", "-version"])
     audio_file = f"{common_uuid}.wav"
     run(["ffmpeg", "-i", Video, audio_file])
+    transcript_file = f"{common_uuid}.srt"
     # Transcription with Whisper.
+    print("Starting transcription with Whisper with word-level timestamps and VAD filter")
+    segments, _ = whisper_model.transcribe(audio_file, word_timestamps=True, vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))
+    # Process each segment and word for detailed timestamping
+    transcript_with_timestamps = []
+    for segment in segments:
+        for word in segment.words:
+            start_time = f"{word.start:.2f}"
+            end_time = f"{word.end:.2f}"
+            transcript_with_timestamps.append(f"[{start_time}s -> {end_time}s] {word.word}")
     # Create a list to hold the translated lines.
     translated_lines = []
     with open(transcript_file, "w+", encoding="utf-8") as f:
         counter = 1
+        for line in transcript_with_timestamps:
+            # Extract timestamp and word from the line
+            timestamp, word = re.match(r"\[(.*?)s -> (.*?)s\] (.*)", line).groups()
+            start_time, end_time = timestamp.split(' -> ')
+            # Convert timestamps to SRT format
+            formatted_start = str(datetime.timedelta(seconds=float(start_time)))
+            formatted_end = str(datetime.timedelta(seconds=float(end_time)))
+            # Write to SRT file
             f.write(f"{counter}\n")
             f.write(f"{formatted_start} --> {formatted_end}\n")
+            f.write(f"{word}\n\n")
             counter += 1
         # Move the file pointer to the beginning of the file.
         f.seek(0)