File size: 1,658 Bytes
b9354c2
 
 
 
 
 
 
d297f8f
b9354c2
 
 
 
 
 
 
d297f8f
b9354c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
import re

from datasets import Dataset
from yt_dlp.postprocessor import PostProcessor

from interpreter import WhisperInterpreter
from utils import VIDEO_INFO, json_dump

class WhisperPP(PostProcessor):
  """yt-dlp post-processor that runs a Whisper interpreter over each video.

  For every video handed to ``run`` the processor builds a record from the
  ``VIDEO_INFO`` keys of the yt-dlp info dict, merges in the interpreter
  output, and stores the record in ``data``. Optionally the record is also
  written next to the media file as JSON.

  Args:
    data: Container that collects the records. Either a ``list`` (records
      are appended in place) or an object exposing ``add_item``,
      ``num_rows`` and ``push_to_hub`` (presumably a ``datasets.Dataset`` --
      confirm against the caller).
    name: Repository id passed to ``data.push_to_hub`` once enough videos
      have been processed.
    **whisper_options: Recognised keys are removed here; everything left
      over is forwarded unchanged to the interpreter call in ``run``.
        model_size: Passed to ``WhisperInterpreter``; defaults to "base".
        mode: Name of the interpreter method to call; defaults to
          "transcribe".
        write: When truthy, dump each record to a ``.json`` file beside
          the media file; defaults to False.
        number_videos: Row count at which the data is pushed to the hub;
          0 (the default) disables pushing.
  """

  # Captures the text between "datasets/" and "/resolve" in a URL.
  _REPO_ID_PATTERN = re.compile(r"(?<=datasets\/)(.*?)(?=\/resolve)")

  def __init__(self, data, name, **whisper_options):
    super().__init__()
    self._options = whisper_options
    interpreter = WhisperInterpreter(self._options.pop("model_size", "base"))
    self.data = data
    # Resolve the interpreter method once so ``run`` can call it directly.
    self._process = getattr(interpreter, self._options.pop("mode", "transcribe"))
    # ``write`` is optional like every other option; a missing key used to
    # raise KeyError.
    self._write = self._options.pop("write", False)
    self.videos_to_process = self._options.pop("number_videos", 0)
    self.repoId = name

  def run(self, info):
    """Process one video and record the result.

    Args:
      info: yt-dlp info dict; must contain "id", "filepath" and every key
        listed in ``VIDEO_INFO``.

    Returns:
      A tuple ``([], info)``: an empty list followed by the unchanged info
      dict.
    """
    self.to_screen(f"Processing Video {info['id']}")
    result = {key: info[key] for key in VIDEO_INFO}
    result.update(self._process(info["filepath"], **self._options))
    self.to_screen(f"Processed Video {info['id']} and appended results.")
    self._update_data(result)
    if self._write:
      # Strip only the final extension. Splitting on the first "." broke on
      # paths such as "./videos/a.mp4" (producing ".json") or directories
      # containing dots.
      base_path, _ = os.path.splitext(info["filepath"])
      json_dump(result, f"{base_path}.json")
    return [], info

  def _update_data(self, record):
    """Store ``record`` in ``self.data`` and push to the hub when due."""
    if isinstance(self.data, list):
      self.data.append(record)
      return
    # ``add_item`` returns the updated container rather than mutating it,
    # so the result has to be re-bound.
    self.data = self.data.add_item(record)
    # A falsy threshold (0 or None) means "never push".
    if self.videos_to_process and self.data.num_rows >= self.videos_to_process:
      self.data.push_to_hub(self.repoId)

  def get_data(self):
    """Return the container holding all records collected so far."""
    return self.data

  def _get_name(self):
    """Derive a repository id from the data's download checksums.

    Looks at the first key of ``self.data.info.download_checksums`` and
    returns the part between "datasets/" and "/resolve".

    Returns:
      The extracted id, or "" when the data carries no ``info``, has no
      checksums, or the first URL does not match the expected layout.
    """
    data_info = getattr(self.data, "info", None)
    checksums = getattr(data_info, "download_checksums", None)
    if not checksums:
      return ""
    first_url = next(iter(checksums))
    match = self._REPO_ID_PATTERN.search(first_url)
    return match.group(1) if match else ""