RamAnanth1 commited on
Commit
ee0e4c2
·
1 Parent(s): b9354c2

Delete whisper_post_processor.py

Browse files
Files changed (1) hide show
  1. whisper_post_processor.py +0 -46
whisper_post_processor.py DELETED
@@ -1,46 +0,0 @@
1
- from interpreter import WhisperInterpreter
2
- from utils import VIDEO_INFO, json_dump
3
- from yt_dlp.postprocessor import PostProcessor
4
- from datasets import Dataset
5
- import re
6
-
7
- class WhisperPP(PostProcessor):
8
- def __init__(self,data,**whisper_options):
9
- super().__init__()
10
- self._options = whisper_options
11
- interpreter = WhisperInterpreter(self._options.pop("model_size","base"))
12
- self.data = data
13
- self._process = getattr(interpreter, self._options.pop("mode","transcribe"))
14
- self._write = self._options.pop("write")
15
- self.videos_to_process = self._options.pop("number_videos",0)
16
- self.repoId = self._get_name()
17
-
18
- def run(self, info):
19
- self.to_screen(f"Processing Video {info['id']}")
20
- result = {key: info[key] for key in VIDEO_INFO}
21
- result.update(self._process(info["filepath"], **self._options))
22
- self.to_screen(f"Processed Video {info['id']} and appended results.")
23
- self._update_data(result)
24
- if self._write:
25
- json_dump(result, f"{info['filepath'].split('.')[0]}.json")
26
- return [], info
27
-
28
- def _update_data(self, record):
29
- dataType = type(self.data)
30
- if dataType == list:
31
- self.data.append(record)
32
- else:
33
- self.data = self.data.add_item(record)
34
- if self.data.num_rows >= self.videos_to_process and self.videos_to_process != 0:
35
- self.data.push_to_hub(self.repoId)
36
-
37
- def get_data(self):
38
- return self.data
39
-
40
- def _get_name(self):
41
- if self.data.info.download_checksums is not None:
42
- regex = r"(?<=datasets\/)(.*?)(?=\/resolve)"
43
- repoId = re.compile(regex)
44
- url = list(self.data.info.download_checksums.keys())[0]
45
- return repoId.findall(url)[0]
46
- return ""