marquesafonso committed on
Commit
80b7d93
·
1 Parent(s): 7f3ee84

"add wordlevel highlighting feature (wip)"

Browse files
main.py CHANGED
@@ -16,12 +16,12 @@ from fastapi.security import HTTPBasic
16
  from pydantic import BaseModel, field_validator
17
  from cachetools import TTLCache
18
 
19
- ## THIS IS A BREAKING CHANGE. SRT FILE INPUT DEPRECATED. WIP.
20
- ## DONE: separate transcriber from subtitler logic. WIP.
21
- ## DONE: improve loading spinner. WIP (with redirect)
22
  ## DONE: fix tempdir cleanup
23
  ## DONE: add transcription preview component + allow for interactive validation of transcription in-browser.
24
- ## TODO: add word level highlighting option
25
  ## TODO: improve UI
26
 
27
  app = FastAPI()
@@ -79,10 +79,14 @@ async def transcribe_api(video_file: MP4Video = Depends(),
79
  with open(video_path, 'wb') as f:
80
  shutil.copyfileobj(video_file.file, f)
81
 
82
- transcription = transcriber(video_path, max_words_per_line, task, model_version)
83
 
84
  uid = str(uuid4())
85
- cache[uid] = {"video_path": video_path, "transcription": transcription, "temp_dir_path": temp_dir.name}
 
 
 
 
86
  return RedirectResponse(url=f"/process_settings/?uid={uid}", status_code=303)
87
 
88
  except Exception as e:
@@ -95,7 +99,8 @@ async def process_settings(request: Request, uid: str):
95
  raise HTTPException(404, "Data not found")
96
  return templates.TemplateResponse("process_settings.html", {
97
  "request": request,
98
- "transcription": data["transcription"],
 
99
  "video_path": data["video_path"],
100
  "temp_dir_path": data["temp_dir_path"]
101
  })
@@ -104,15 +109,17 @@ async def process_settings(request: Request, uid: str):
104
  async def process_video_api(video_path: str = Form(...),
105
  temp_dir_path: str = Form(...),
106
  srt_string: str = Form(...),
 
107
  fontsize: Optional[int] = Form(42),
108
  font: Optional[str] = Form("Helvetica"),
109
  bg_color: Optional[str] = Form("#070a13b3"),
110
  text_color: Optional[str] = Form("white"),
 
111
  caption_mode: Optional[str] = Form("desktop"),
112
  temp_dir: TemporaryDirectory = Depends(get_temp_dir)
113
  ):
114
  try:
115
- output_path = process_video(video_path, srt_string, fontsize, font, bg_color, text_color, caption_mode)
116
  with open(os.path.join(temp_dir.name, f"{video_path.split('.')[0]}.srt"), 'w+') as temp_srt_file:
117
  logging.info("Processing the video...")
118
  temp_srt_file.write(srt_string)
 
16
  from pydantic import BaseModel, field_validator
17
  from cachetools import TTLCache
18
 
19
+ ## THIS IS A BREAKING CHANGE. SRT FILE INPUT DEPRECATED.
20
+ ## DONE: separate transcriber from subtitler logic.
21
+ ## DONE: improve loading spinner. (redirect)
22
  ## DONE: fix tempdir cleanup
23
  ## DONE: add transcription preview component + allow for interactive validation of transcription in-browser.
24
+ ## TODO: add word level highlighting option. WIP (word background margins need to be addressed; mobile mode needs work in json mode)
25
  ## TODO: improve UI
26
 
27
  app = FastAPI()
 
79
  with open(video_path, 'wb') as f:
80
  shutil.copyfileobj(video_file.file, f)
81
 
82
+ transcription_text, transcription_json = transcriber(video_path, max_words_per_line, task, model_version)
83
 
84
  uid = str(uuid4())
85
+ cache[uid] = {
86
+ "video_path": video_path,
87
+ "transcription_text": transcription_text,
88
+ "transcription_json": transcription_json,
89
+ "temp_dir_path": temp_dir.name}
90
  return RedirectResponse(url=f"/process_settings/?uid={uid}", status_code=303)
91
 
92
  except Exception as e:
 
99
  raise HTTPException(404, "Data not found")
100
  return templates.TemplateResponse("process_settings.html", {
101
  "request": request,
102
+ "transcription_text": data["transcription_text"],
103
+ "transcription_json": data["transcription_json"],
104
  "video_path": data["video_path"],
105
  "temp_dir_path": data["temp_dir_path"]
106
  })
 
109
  async def process_video_api(video_path: str = Form(...),
110
  temp_dir_path: str = Form(...),
111
  srt_string: str = Form(...),
112
+ srt_json: str = Form(...),
113
  fontsize: Optional[int] = Form(42),
114
  font: Optional[str] = Form("Helvetica"),
115
  bg_color: Optional[str] = Form("#070a13b3"),
116
  text_color: Optional[str] = Form("white"),
117
+ highlight_mode: Optional[bool] = Form(False),
118
  caption_mode: Optional[str] = Form("desktop"),
119
  temp_dir: TemporaryDirectory = Depends(get_temp_dir)
120
  ):
121
  try:
122
+ output_path = process_video(video_path, srt_string, srt_json, fontsize, font, bg_color, text_color, highlight_mode, caption_mode)
123
  with open(os.path.join(temp_dir.name, f"{video_path.split('.')[0]}.srt"), 'w+') as temp_srt_file:
124
  logging.info("Processing the video...")
125
  temp_srt_file.write(srt_string)
static/process_settings.html CHANGED
@@ -8,14 +8,44 @@
8
  label, select, input, textarea { display: block; width: 100%; margin-bottom: 1rem; }
9
  textarea { height: 200px; font-family: monospace; }
10
  input[type="submit"] { background: #4CAF50; color: white; padding: 0.8rem; border: none; cursor: pointer; }
 
 
 
 
 
 
 
 
 
 
 
11
  </style>
12
  </head>
13
  <body>
14
  <form action="/process_video/" method="post">
15
  <h2>Step 2: Edit Transcription & Style</h2>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- <label for="srt_string">Correct Transcription</label>
18
- <textarea name="srt_string" id="srt_string">{{ transcription }}</textarea>
 
 
19
 
20
  <label for="fontsize">Font size</label>
21
  <input type="number" name="fontsize" value="42">
@@ -78,6 +108,11 @@
78
  // Populate dropdowns with defaults
79
  populateDropdown('font', '/static/fonts.txt', DEFAULT_FONT);
80
  populateDropdown('text_color', '/static/colors.txt', DEFAULT_COLOR);
 
 
 
 
 
81
  </script>
82
  </body>
83
  </html>
 
8
  label, select, input, textarea { display: block; width: 100%; margin-bottom: 1rem; }
9
  textarea { height: 200px; font-family: monospace; }
10
  input[type="submit"] { background: #4CAF50; color: white; padding: 0.8rem; border: none; cursor: pointer; }
11
+ .radio-container {
12
+ display: flex;
13
+ gap: 1rem;
14
+ margin-bottom: 1rem;
15
+ }
16
+
17
+ .radio-option {
18
+ display: flex;
19
+ flex-direction: column;
20
+ align-items: center;
21
+ }
22
  </style>
23
  </head>
24
  <body>
25
  <form action="/process_video/" method="post">
26
  <h2>Step 2: Edit Transcription & Style</h2>
27
+
28
+ <div class="radio-container">
29
+ <div class="radio-option">
30
+ <label for="mode_normal">Normal</label>
31
+ <input type="radio" name="highlight_mode" value="false" id="mode_normal" checked onchange="toggleTranscriptionFields()">
32
+ </div>
33
+ <div class="radio-option">
34
+ <label for="mode_highlight">Word-level</label>
35
+ <input type="radio" name="highlight_mode" value="true" id="mode_highlight" onchange="toggleTranscriptionFields()">
36
+ </div>
37
+ </div>
38
+
39
+ <!-- Textareas -->
40
+ <div id="normal_input">
41
+ <label for="srt_string">Transcription (SRT)</label>
42
+ <textarea name="srt_string" id="srt_string">{{ transcription_text }}</textarea>
43
+ </div>
44
 
45
+ <div id="highlight_input" style="display: none;">
46
+ <label for="srt_json">Transcription (JSON)</label>
47
+ <textarea name="srt_json" id="srt_json">{{ transcription_json }}</textarea>
48
+ </div>
49
 
50
  <label for="fontsize">Font size</label>
51
  <input type="number" name="fontsize" value="42">
 
108
  // Populate dropdowns with defaults
109
  populateDropdown('font', '/static/fonts.txt', DEFAULT_FONT);
110
  populateDropdown('text_color', '/static/colors.txt', DEFAULT_COLOR);
111
// Show the transcription editor that matches the selected subtitle mode:
// the SRT textarea wrapper (#normal_input) when "Normal" is selected, the
// JSON textarea wrapper (#highlight_input) when "Word-level" is selected.
// Only the wrappers' CSS `display` is toggled; nothing is removed or disabled.
function toggleTranscriptionFields() {
  const highlightSelected = document.getElementById('mode_highlight').checked;
  const normalBox = document.getElementById('normal_input');
  const highlightBox = document.getElementById('highlight_input');

  if (highlightSelected) {
    normalBox.style.display = 'none';
    highlightBox.style.display = 'block';
  } else {
    normalBox.style.display = 'block';
    highlightBox.style.display = 'none';
  }
}
116
  </script>
117
  </body>
118
  </html>
utils/process_video.py CHANGED
@@ -1,17 +1,19 @@
1
- import logging, os
2
  from utils.subtitler import subtitler
3
 
4
  def process_video(invideo_file: str,
5
  srt_string:str,
 
6
  fontsize:str,
7
  font:str,
8
  bg_color:str,
9
  text_color:str,
 
10
  caption_mode:str
11
  ):
12
  invideo_path_parts = os.path.normpath(invideo_file).split(os.path.sep)
13
  VIDEO_NAME = os.path.basename(invideo_file)
14
  OUTVIDEO_PATH = os.path.join(os.path.normpath('/'.join(invideo_path_parts[:-1])), f"result_{VIDEO_NAME}")
15
  logging.info("Subtitling...")
16
- subtitler(invideo_file, srt_string, OUTVIDEO_PATH, fontsize, font, bg_color, text_color, caption_mode)
17
  return OUTVIDEO_PATH
 
1
+ import logging, os, json
2
  from utils.subtitler import subtitler
3
 
def process_video(invideo_file: str,
                  srt_string:str,
                  srt_json: str,
                  fontsize:str,
                  font:str,
                  bg_color:str,
                  text_color:str,
                  highlight_mode: bool,
                  caption_mode:str
                  ):
    """Burn subtitles into a video and return the path of the result.

    The output file is placed in the same directory as ``invideo_file`` and
    is named ``result_<original file name>``. All subtitle and styling
    arguments are forwarded unchanged to ``subtitler``, which does the
    actual rendering.

    Args:
        invideo_file: Path of the video to subtitle.
        srt_string: Subtitle text in SRT form.
        srt_json: Word-level transcription payload, as a string.
        fontsize: Caption font size, forwarded as-is.
        font: Caption font name.
        bg_color: Caption background colour.
        text_color: Caption text colour.
        highlight_mode: Whether word-level highlighting is requested.
        caption_mode: Caption layout mode (e.g. the form default "desktop").

    Returns:
        The path the subtitled video was written to.
    """
    # Rebuild the parent directory from the normalised path components.
    normalized_input = os.path.normpath(invideo_file)
    parent_components = normalized_input.split(os.path.sep)[:-1]
    parent_dir = os.path.normpath('/'.join(parent_components))

    video_name = os.path.basename(invideo_file)
    outvideo_path = os.path.join(parent_dir, f"result_{video_name}")

    logging.info("Subtitling...")
    subtitler(
        invideo_file,
        srt_string,
        srt_json,
        outvideo_path,
        fontsize,
        font,
        bg_color,
        text_color,
        highlight_mode,
        caption_mode,
    )
    return outvideo_path
utils/subtitler.py CHANGED
@@ -1,5 +1,5 @@
1
  from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
2
- import os
3
 
4
  def parse_srt(srt_string):
5
  """Parse the SRT string and return a list of (start, end, text) for each subtitle."""
@@ -27,31 +27,77 @@ def filter_caption_width(caption_mode:str):
27
  caption_height_ratio = 0.7
28
  return caption_width_ratio, caption_height_ratio
29
 
30
- def subtitler(video_file:str,
31
- srt_string:str,
32
- output_file:str,
33
- fontsize:int,
 
 
34
  font: str,
35
- bg_color:str,
36
- text_color:str,
37
- caption_mode:str
 
38
  ):
39
- """Add subtitles from an SRT string to a video."""
40
  video_file = os.path.abspath(video_file)
41
  output_file = os.path.abspath(output_file)
42
  clip = VideoFileClip(filename=video_file, target_resolution=None)
43
- subtitles = parse_srt(srt_string)
44
  subtitle_clips = []
 
45
  caption_width_ratio, caption_height_ratio = filter_caption_width(caption_mode)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  for start, end, text in subtitles:
47
- # Create TextClip with specified styling
48
- # To get a list of possible color and font values run: print(TextClip.list("font"), '\n\n', TextClip.list("color"))
49
- txt_clip = TextClip(text, fontsize=fontsize, color=text_color, font=font, method='caption',
50
- bg_color=bg_color, align='center', size=(clip.w*caption_width_ratio, None))
51
- txt_clip = txt_clip.set_position(('center', 'bottom')).set_duration(clip.duration).set_start(start).set_end(end)
52
- subtitle_x_position = 'center'
53
- subtitle_y_position = clip.h * caption_height_ratio
54
- text_position = (subtitle_x_position, subtitle_y_position)
55
- subtitle_clips.append(txt_clip.set_position(text_position))
 
56
  video = CompositeVideoClip(size=None, clips=[clip] + subtitle_clips)
57
  video.write_videofile(output_file, codec='libx264', audio_codec='aac')
 
1
  from moviepy.editor import VideoFileClip, CompositeVideoClip, TextClip
2
+ import os, json
3
 
4
  def parse_srt(srt_string):
5
  """Parse the SRT string and return a list of (start, end, text) for each subtitle."""
 
27
  caption_height_ratio = 0.7
28
  return caption_width_ratio, caption_height_ratio
29
 
30
+
31
def _load_srt_json(srt_json):
    """Turn the word-level transcription payload into a dict without executing it.

    SECURITY: this value comes from a user-editable form field. It used to be
    passed to ``eval()``, which would run arbitrary Python supplied by the
    client. It is now parsed as data only: strict JSON first, then a Python
    literal (e.g. a dict rendered with single quotes) via
    ``ast.literal_eval``, which evaluates literals but never calls code.

    Raises:
        ValueError: if the payload cannot be parsed or is not a mapping.
    """
    import ast  # local import: only needed for the Python-literal fallback

    if isinstance(srt_json, dict):
        return srt_json
    try:
        data = json.loads(srt_json)
    except (TypeError, ValueError):
        try:
            data = ast.literal_eval(srt_json)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as err:
            raise ValueError(
                "srt_json is neither valid JSON nor a Python literal"
            ) from err
    if not isinstance(data, dict):
        raise ValueError("srt_json must describe an object with a 'lines' list")
    return data


def subtitler(video_file: str,
              srt_string: str,
              srt_json: str,
              output_file: str,
              fontsize: int,
              font: str,
              bg_color: str,
              text_color: str,
              highlight_mode: bool,
              caption_mode: str
              ):
    """Add subtitles to a video, with optional word-level highlighting.

    Args:
        video_file: Path of the input video (made absolute before use).
        srt_string: SRT text; parsed with ``parse_srt`` in normal mode.
        srt_json: Word-level transcription as a string. Expected to hold an
            object with a ``"lines"`` list; every line needs ``"start"``,
            ``"end"``, ``"text"`` and ``"words"``, and every word needs
            ``"word"``, ``"start"`` and ``"end"``. Only read in highlight mode.
        output_file: Destination path (made absolute), encoded with
            libx264 video and aac audio.
        fontsize: Font size passed to every ``TextClip``.
        font: Font name passed to every ``TextClip``.
        bg_color: Caption background colour. Only used in normal mode; the
            highlight overlay uses a fixed "LightBlue" background.
        text_color: Text colour (also the stroke colour of highlighted words).
        highlight_mode: When true, render each line plus a per-word overlay
            timed to the word; otherwise render plain SRT captions.
        caption_mode: Passed to ``filter_caption_width`` to obtain the caption
            width and height ratios.

    Returns:
        None. The subtitled video is written to ``output_file``.

    Raises:
        ValueError: in highlight mode, if ``srt_json`` cannot be parsed safely.
    """
    video_file = os.path.abspath(video_file)
    output_file = os.path.abspath(output_file)
    clip = VideoFileClip(filename=video_file, target_resolution=None)

    subtitle_clips = []

    caption_width_ratio, caption_height_ratio = filter_caption_width(caption_mode)
    subtitle_y_position = clip.h * caption_height_ratio

    if highlight_mode:
        srt_data = _load_srt_json(srt_json)

        # The width of a single space depends only on font and size, so it is
        # measured once instead of rendering a new clip for every word.
        space_width = TextClip(" ", fontsize=fontsize, font=font, method='label').w

        for line in srt_data.get("lines", []):
            line_start = float(line["start"])
            line_end = float(line["end"])
            line_text = line["text"]

            # Full line of text, visible for the whole duration of the line.
            base_clip = TextClip(line_text, fontsize=fontsize, color=text_color, font=font, method='label')
            base_clip = base_clip.set_start(line_start).set_end(line_end)

            # Center the full line horizontally.
            x_center = (clip.w - base_clip.w) // 2
            base_clip = base_clip.set_position((x_center, subtitle_y_position))
            subtitle_clips.append(base_clip)

            # Walk left to right, stacking a highlighted copy of each word on
            # top of the base line for the time span in which it is spoken.
            current_x = x_center
            for word_info in line["words"]:
                word = word_info["word"]
                word_start = float(word_info["start"])
                word_end = float(word_info["end"])

                # Word re-rendered with a stroke and a "LightBlue" background.
                word_clip = TextClip(word, fontsize=fontsize, color=text_color, stroke_color=text_color, stroke_width=2, font=font,
                                     method='label', bg_color="LightBlue")
                word_clip = word_clip.set_start(word_start).set_end(word_end)
                # NOTE(review): the 7.5 px shift looks like a hand-tuned
                # margin correction for the overlay -- confirm against output.
                word_clip = word_clip.set_position((current_x - 7.5, subtitle_y_position))
                subtitle_clips.append(word_clip)

                current_x += word_clip.w + space_width
    else:
        # Normal mode: one caption clip per SRT entry, wrapped to a fraction
        # of the video width.
        text_position = ('center', subtitle_y_position)
        for start, end, text in parse_srt(srt_string):
            txt_clip = TextClip(text,
                                fontsize=fontsize,
                                color=text_color,
                                font=font,
                                method='caption',
                                bg_color=bg_color,
                                align='center',
                                size=(clip.w * caption_width_ratio, None))
            txt_clip = txt_clip.set_start(start).set_end(end).set_position(text_position)
            subtitle_clips.append(txt_clip)

    # Both modes end the same way: overlay the clips and encode the result.
    video = CompositeVideoClip(size=None, clips=[clip] + subtitle_clips)
    video.write_videofile(output_file, codec='libx264', audio_codec='aac')
utils/transcriber.py CHANGED
@@ -19,4 +19,4 @@ def transcriber(invideo_file:str,
19
  model_version=model_version,
20
  api_name="/predict"
21
  )
22
- return result[0]
 
19
  model_version=model_version,
20
  api_name="/predict"
21
  )
22
+ return result[0], result[3]