Spaces:
Runtime error
Runtime error
from interpreter import WhisperInterpreter | |
from utils import VIDEO_INFO, json_dump | |
from yt_dlp.postprocessor import PostProcessor | |
from datasets import Dataset | |
import re | |
class WhisperPP(PostProcessor):
    """yt-dlp post-processor that runs a Whisper interpreter on each file.

    For every processed download, the fields named in ``VIDEO_INFO`` are
    copied from the info dict, merged with the interpreter's output, and
    stored in ``data``. Optionally the merged record is also written next to
    the media file as JSON.
    """

    # Captures the text between "datasets/" and "/resolve" in a download URL.
    _REPO_ID_PATTERN = re.compile(r"(?<=datasets\/)(.*?)(?=\/resolve)")

    def __init__(self, data, name, **whisper_options):
        """Set up the interpreter and the record store.

        Args:
            data: Where records are accumulated. Either a ``list`` (records
                are appended in place) or an object exposing ``add_item``,
                ``num_rows`` and ``push_to_hub`` (presumably a
                ``datasets.Dataset`` -- confirm against the caller).
            name: Repository id handed to ``push_to_hub``.
            **whisper_options: Options for this post-processor and for the
                interpreter call. The following keys are consumed here and
                NOT forwarded to the interpreter:

                * ``model_size`` (default ``"base"``): passed to
                  ``WhisperInterpreter``.
                * ``mode`` (default ``"transcribe"``): name of the
                  interpreter method invoked for each file.
                * ``write`` (default ``False``): whether to dump each record
                  to a JSON file.
                * ``number_videos`` (default ``0``): row count at which the
                  data is pushed to the hub; ``0`` disables pushing.

                Every remaining key is forwarded as a keyword argument to the
                selected interpreter method.
        """
        super().__init__()
        self._options = whisper_options
        interpreter = WhisperInterpreter(self._options.pop("model_size", "base"))
        self.data = data
        self._process = getattr(interpreter, self._options.pop("mode", "transcribe"))
        # Defaulting keeps "write" optional like every other option here.
        self._write = self._options.pop("write", False)
        self.videos_to_process = self._options.pop("number_videos", 0)
        self.repoId = name

    def run(self, info):
        """Process one downloaded file and record the result.

        Args:
            info: Info dict for the download. Must contain ``id``,
                ``filepath`` and every key listed in ``VIDEO_INFO``.

        Returns:
            A ``([], info)`` tuple: an empty list followed by the unchanged
            info dict (in yt-dlp's post-processor protocol the first element
            lists files to delete).
        """
        # Local import: the file's import block does not bring in os.path.
        from os.path import splitext

        self.to_screen(f"Processing Video {info['id']}")
        result = {key: info[key] for key in VIDEO_INFO}
        result.update(self._process(info["filepath"], **self._options))
        self.to_screen(f"Processed Video {info['id']} and appended results.")
        self._update_data(result)
        if self._write:
            # splitext strips only the final extension, so dots in directory
            # names or in the title (e.g. "./out/v1.2 talk.mp4") are kept.
            json_dump(result, f"{splitext(info['filepath'])[0]}.json")
        return [], info

    def _update_data(self, record):
        """Add ``record`` to ``data`` and push to the hub once the limit is hit.

        Lists are appended to in place. Any other store is replaced by the
        result of ``add_item``; after that, if ``videos_to_process`` is
        non-zero and the store has at least that many rows, it is pushed to
        ``repoId``. Once the limit is reached, every later record triggers
        another push.
        """
        if isinstance(self.data, list):
            self.data.append(record)
            return

        self.data = self.data.add_item(record)
        limit = self.videos_to_process
        if limit != 0 and self.data.num_rows >= limit:
            self.data.push_to_hub(self.repoId)

    def get_data(self):
        """Return the current record store (list or dataset-like object)."""
        return self.data

    def _get_name(self):
        """Extract the repository id from the data's first download URL.

        Returns:
            The text between ``datasets/`` and ``/resolve`` in the first key
            of ``data.info.download_checksums``, or ``""`` when the data has
            no such metadata, the mapping is empty, or the URL does not
            match.
        """
        data_info = getattr(self.data, "info", None)
        checksums = getattr(data_info, "download_checksums", None)
        if not checksums:
            return ""
        first_url = next(iter(checksums))
        match = self._REPO_ID_PATTERN.search(first_url)
        return match.group(1) if match else ""