Spaces:

tpha4308
/

video-qa

Sleeping

App Files Files Community

Thao Pham committed on Mar 31

Commit

f33d0ed

1 Parent(s): 9c77f6f

add utils function for transcribe

Browse files

Files changed (1) hide show

utils.py +40 -2

utils.py CHANGED Viewed

@@ -7,6 +7,46 @@ from tqdm import tqdm
 from pytubefix import YouTube, Stream
 import cv2
 import json
 # Taken from the course: https://www.deeplearning.ai/short-courses/multimodal-rag-chat-with-videos/
 def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int=-1) -> str:
@@ -14,8 +54,6 @@ def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int=-1) -> str:
     if format == 'vtt':
         write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
-    elif format == 'srt':
-        write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
     else:
         raise Exception("Unknown format " + format)

 from pytubefix import YouTube, Stream
 import cv2
 import json
+import textwrap
# Helper that converts a time in seconds to the timestamp format used by .vtt or .srt files.
def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
    """Render an offset in seconds as ``[HH:]MM:SS<sep>mmm``.

    Args:
        seconds: Non-negative offset in seconds; it is rounded to the nearest
            millisecond before being split into fields.
        always_include_hours: When true, the ``HH:`` prefix is emitted even if
            the hour count is zero. Otherwise it appears only for offsets of
            one hour or more.
        fractionalSeperator: String placed between the seconds and the
            three-digit millisecond field.

    Returns:
        The formatted timestamp string.

    Raises:
        AssertionError: If ``seconds`` is not >= 0 (negative or NaN).
    """
    # Explicit raise rather than `assert`, so the check still runs under
    # `python -O`. The exception type and message match the original assert.
    if not seconds >= 0:
        raise AssertionError("non-negative timestamp expected")

    remaining_ms = round(seconds * 1000.0)
    hours, remaining_ms = divmod(remaining_ms, 3_600_000)
    minutes, remaining_ms = divmod(remaining_ms, 60_000)
    whole_seconds, millis = divmod(remaining_ms, 1_000)

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{whole_seconds:02d}{fractionalSeperator}{millis:03d}"
+def _processText(text: str, maxLineWidth=None):
+    if (maxLineWidth is None or maxLineWidth < 0):
+        return text
+    lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
+    return '\n'.join(lines)
# Helper that converts transcript segments generated by Whisper into a .vtt file.
def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """Write ``transcript`` to ``file`` as a WEBVTT document.

    Args:
        transcript: Iterable of segment dicts; each is read through its
            ``'start'``, ``'end'`` and ``'text'`` keys.
        file: Text stream that receives the output.
        maxLineWidth: Passed to ``_processText`` to control wrapping of each
            segment's text.
    """
    print("WEBVTT\n", file=file)
    for cue in transcript:
        # '-->' is the separator used on the timing line below, so it is
        # rewritten to '->' inside the cue text.
        cue_text = _processText(cue['text'], maxLineWidth).replace('-->', '->')
        cue_start = format_timestamp(cue['start'])
        cue_end = format_timestamp(cue['end'])
        # Flushed after every cue, so output appears as segments are consumed.
        print(f"{cue_start} --> {cue_end}\n{cue_text}\n", file=file, flush=True)
 # Taken from the course: https://www.deeplearning.ai/short-courses/multimodal-rag-chat-with-videos/
 def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int=-1) -> str:
     if format == 'vtt':
         write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
     else:
         raise Exception("Unknown format " + format)