Spaces:

marquesafonso
/

multilang-asr-captioner

Running

App Files Community

marquesafonso committed on Jun 15, 2024

Commit

d63a8c1

1 Parent(s): 97246dc

add translation task. tweak fontsize and max_words_per_line defaults.

Browse files

Files changed (6) hide show

.dockerignore +1 -0
main.py +6 -4
static/submit_video.html +13 -0
utils/archiver.py +3 -2
utils/process_video.py +2 -1
utils/transcriber.py +3 -1

.dockerignore CHANGED Viewed

@@ -4,6 +4,7 @@ __pycache__/
 *.git
 data/
 temp/
 cli.py
 Pipfile
 Pipfile.lock

 *.git
 data/
 temp/
+archive/
 cli.py
 Pipfile
 Pipfile.lock

main.py CHANGED Viewed

@@ -86,14 +86,16 @@ async def get_form():
 @app.post("/process_video/")
 async def process_video_api(video_file: MP4Video = Depends(),
                             srt_file: SRTFile = Depends(),
-                            max_words_per_line: Optional[int] = Form(8),
-                            fontsize: Optional[int] = Form(36),
                             font: Optional[str] = Form("FuturaPTHeavy"),
                             bg_color: Optional[str] = Form("#070a13b3"),
                             text_color: Optional[str] = Form("white"),
                             username: str = Depends(get_current_user)
                             ):
     try:
         logging.info("Creating temporary directories")
         temp_dir = os.path.join(os.getcwd(),"temp")
         os.makedirs(temp_dir, exist_ok=True)
@@ -115,12 +117,12 @@ async def process_video_api(video_file: MP4Video = Depends(),
                 finally:
                     srt_file.file.close()
             logging.info("Processing the video...")
-            output_path, _ = process_video(temp_input_path, SRT_PATH, max_words_per_line, fontsize, font, bg_color, text_color)
             logging.info("Zipping response...")
             zip_path = zip_response(os.path.join(temp_vid_dir,"archive.zip"), [output_path, SRT_PATH])
             return FileResponse(zip_path, media_type='application/zip', filename=f"result_{video_file.filename.split('.')[0]}.zip")
         logging.info("Processing the video...")
-        output_path, srt_path = process_video(temp_input_path, None, max_words_per_line, fontsize, font, bg_color, text_color)
         logging.info("Zipping response...")
         zip_path = zip_response(os.path.join(temp_vid_dir,"archive.zip"), [output_path, srt_path])
         return  FileResponse(zip_path, media_type='application/zip', filename=f"result_{video_file.filename.split('.')[0]}.zip")

 @app.post("/process_video/")
 async def process_video_api(video_file: MP4Video = Depends(),
                             srt_file: SRTFile = Depends(),
+                            task: Optional[str] = Form("transcribe"),
+                            max_words_per_line: Optional[int] = Form(6),
+                            fontsize: Optional[int] = Form(42),
                             font: Optional[str] = Form("FuturaPTHeavy"),
                             bg_color: Optional[str] = Form("#070a13b3"),
                             text_color: Optional[str] = Form("white"),
                             username: str = Depends(get_current_user)
                             ):
     try:
+        print(task)
         logging.info("Creating temporary directories")
         temp_dir = os.path.join(os.getcwd(),"temp")
         os.makedirs(temp_dir, exist_ok=True)
                 finally:
                     srt_file.file.close()
             logging.info("Processing the video...")
+            output_path, _ = process_video(temp_input_path, SRT_PATH, task, max_words_per_line, fontsize, font, bg_color, text_color)
             logging.info("Zipping response...")
             zip_path = zip_response(os.path.join(temp_vid_dir,"archive.zip"), [output_path, SRT_PATH])
             return FileResponse(zip_path, media_type='application/zip', filename=f"result_{video_file.filename.split('.')[0]}.zip")
         logging.info("Processing the video...")
+        output_path, srt_path = process_video(temp_input_path, None, task, max_words_per_line, fontsize, font, bg_color, text_color)
         logging.info("Zipping response...")
         zip_path = zip_response(os.path.join(temp_vid_dir,"archive.zip"), [output_path, srt_path])
         return  FileResponse(zip_path, media_type='application/zip', filename=f"result_{video_file.filename.split('.')[0]}.zip")

static/submit_video.html CHANGED Viewed

@@ -31,6 +31,14 @@
                 border: 1px solid #ddd;
                 box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.1);
             }
             input[type=submit] {
                 width: 25%;
@@ -92,6 +100,11 @@
         <form action="/process_video/" enctype="multipart/form-data" method="post">
             Video File: <input type="file" name="video_file"><br>
             Subtitles File: <input type="file" name="srt_file"><br>
             Max words per line: <input type="number" name="max_words_per_line" value="8"><br>
             Font size: <input type="number" name="fontsize" value="36"><br>
             Font: <input type="text" name="font" value="FuturaPTHeavy"><br>

                 border: 1px solid #ddd;
                 box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.1);
             }
+            select {
+                width: 30%;
+                padding: 10px;
+                margin-bottom: 10px;
+                border-radius: 4px;
+                border: 1px solid #ddd;
+                box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.1);
+            }
             input[type=submit] {
                 width: 25%;
         <form action="/process_video/" enctype="multipart/form-data" method="post">
             Video File: <input type="file" name="video_file"><br>
             Subtitles File: <input type="file" name="srt_file"><br>
+            <label for="task">Task</label>
+            <select id="task" name="task">
+                <option value="transcribe">Transcribe</option>
+                <option value="translate">Translate</option>
+            </select><br>
             Max words per line: <input type="number" name="max_words_per_line" value="8"><br>
             Font size: <input type="number" name="fontsize" value="36"><br>
             Font: <input type="text" name="font" value="FuturaPTHeavy"><br>

utils/archiver.py CHANGED Viewed

@@ -2,14 +2,15 @@ import shutil, os
 from datetime import datetime
 def archiver(timestamp:datetime):
-    ARCHIVE = os.path.abspath(f"archive/{timestamp.year:4d}-{timestamp.month:02d}-{timestamp.day:02d}/")
     TEMP_DIR = os.path.abspath("temp/")
     LOG_FILE = os.path.abspath("main.log")
     if os.path.exists(TEMP_DIR):
         shutil.make_archive(os.path.join(ARCHIVE, "files"), 'zip', TEMP_DIR)
         shutil.rmtree(TEMP_DIR)
     if os.path.exists(LOG_FILE):
-        shutil.copy(LOG_FILE, os.path.join(ARCHIVE, f"{timestamp.year:4d}-{timestamp.month:02d}-{timestamp.day:02d}.log"))
         os.remove(LOG_FILE)
 if __name__ == '__main__':

 from datetime import datetime
 def archiver(timestamp:datetime):
+    TIME = f"{timestamp.year:4d}-{timestamp.month:02d}-{timestamp.day:02d}_{timestamp.hour:02d}-{timestamp.minute:02d}"
+    ARCHIVE = os.path.abspath(f"archive/{TIME}")
     TEMP_DIR = os.path.abspath("temp/")
     LOG_FILE = os.path.abspath("main.log")
     if os.path.exists(TEMP_DIR):
         shutil.make_archive(os.path.join(ARCHIVE, "files"), 'zip', TEMP_DIR)
         shutil.rmtree(TEMP_DIR)
     if os.path.exists(LOG_FILE):
+        shutil.copy(LOG_FILE, os.path.join(ARCHIVE, f"{TIME}.log"))
         os.remove(LOG_FILE)
 if __name__ == '__main__':

utils/process_video.py CHANGED Viewed

@@ -13,6 +13,7 @@ logging.basicConfig(filename='main.log',
 # API Function
 def process_video(invideo_filename:str,
                   srt_path: str,
                   max_words_per_line:int,
                   fontsize:str,
                   font:str,
@@ -33,7 +34,7 @@ def process_video(invideo_filename:str,
     SRT_PATH = os.path.abspath(f"{invideo_filename.split('.')[0]}.srt")
     logging.info("Transcribing...")
     if not os.path.exists(SRT_PATH):
-        transcriber(INAUDIO_PATH, SRT_PATH, max_words_per_line)
     logging.info("Subtitling...")
     subtitler(invideo_filename, SRT_PATH, OUTVIDEO_PATH, fontsize, font, bg_color, text_color)
     return OUTVIDEO_PATH, SRT_PATH

 # API Function
 def process_video(invideo_filename:str,
                   srt_path: str,
+                  task: str,
                   max_words_per_line:int,
                   fontsize:str,
                   font:str,
     SRT_PATH = os.path.abspath(f"{invideo_filename.split('.')[0]}.srt")
     logging.info("Transcribing...")
     if not os.path.exists(SRT_PATH):
+        transcriber(INAUDIO_PATH, SRT_PATH, max_words_per_line, task)
     logging.info("Subtitling...")
     subtitler(invideo_filename, SRT_PATH, OUTVIDEO_PATH, fontsize, font, bg_color, text_color)
     return OUTVIDEO_PATH, SRT_PATH

utils/transcriber.py CHANGED Viewed

@@ -29,13 +29,15 @@ def write_srt(segments, srt_path, max_words_per_line):
 def transcriber(input_path:str,
                 srt_path:str,
-                max_words_per_line:int):
     #TODO: model_size = "distil-large-v3" -> need to wait for new pypi version of faster-whisper (pull request already merged)
     model_size = "large-v3"
     model = WhisperModel(model_size, device="cpu", compute_type="int8") #TODO: add condition_on_previous_text=False when using distil-whisper
     segments, info = model.transcribe(
         input_path,
         beam_size=5,
         vad_filter=True,
         vad_parameters=dict(min_silence_duration_ms=500),
         word_timestamps=True

 def transcriber(input_path:str,
                 srt_path:str,
+                max_words_per_line:int,
+                task:str):
     #TODO: model_size = "distil-large-v3" -> need to wait for new pypi version of faster-whisper (pull request already merged)
     model_size = "large-v3"
     model = WhisperModel(model_size, device="cpu", compute_type="int8") #TODO: add condition_on_previous_text=False when using distil-whisper
     segments, info = model.transcribe(
         input_path,
         beam_size=5,
+        task=task,
         vad_filter=True,
         vad_parameters=dict(min_silence_duration_ms=500),
         word_timestamps=True