Youtube-to-HF-Dataset / dataset /transcript_dataset.py
RamAnanth1's picture
Update dataset/transcript_dataset.py
698ae10
raw
history blame
2.62 kB
import glob
import os
import validators
import pandas as pd
from downloader import WhisperPP, YoutubeDownloader
from interpreter import WhisperInterpreter
from datasets import load_dataset, concatenate_datasets, Dataset
from dataset.hf_dataset import HFDataset
class TranscriptDataset(HFDataset):
    """Build a transcript dataset from a YouTube URL or from local files.

    Newly produced rows are merged into ``self.dataset``. Rows whose ``id`` is
    already present in the existing ``train`` split are skipped.

    ``self.dataset`` and ``self.is_empty`` are not defined in this class;
    presumably they are provided by ``HFDataset`` -- confirm against the base
    class.
    """

    def __init__(self, name, token) -> None:
        """Store the dataset name and token, then initialise the base class.

        Args:
            name: Dataset name; also passed to ``WhisperPP`` in ``from_url``.
            token: Token forwarded to ``HFDataset.__init__`` (presumably a
                Hugging Face access token -- confirm).
        """
        self.name = name
        self.token = token
        super().__init__(name, token)

    def generate_dataset(self, input, download_path, overwrite, whisper_config):
        """Dispatch to ``from_url`` or ``from_files`` depending on ``input``.

        Args:
            input: A URL (as judged by ``validators.url``) or anything else,
                which is handed to ``from_files``.
            download_path: Directory passed to ``from_url``; unused for files.
            overwrite: Forwarded unchanged to the selected method.
            whisper_config: Mapping expanded into keyword arguments.
        """
        if validators.url(input):
            self.from_url(input, download_path, overwrite, **whisper_config)
        else:
            self.from_files(input, overwrite, **whisper_config)

    def from_url(self, url: str, download_path: str = "tmp/", overwrite: bool = False, **whisper_config: dict) -> None:
        """Download from ``url``, post-process with ``WhisperPP`` and merge.

        Args:
            url: URL handed to ``YoutubeDownloader.download``.
            download_path: Directory given to ``YoutubeDownloader``; it also
                holds ``video_record.txt`` when ``overwrite`` is false.
            overwrite: When false, a download archive listing the ids already
                in the dataset is configured on the downloader.
            **whisper_config: Extra keyword arguments for ``WhisperPP``.
        """
        # NOTE(review): an earlier revision passed an empty view here
        # (train split filtered on ``id is None``); the full train split is
        # what is handed to WhisperPP now.
        if self.is_empty:
            base_dataset = self.dataset
        else:
            base_dataset = self.dataset["train"]

        # Always forced to 5, overriding any caller-supplied value.
        # NOTE(review): presumably a deliberate cap on processed videos --
        # confirm before turning this into a default.
        whisper_config["number_videos"] = 5

        whisperPP = WhisperPP(base_dataset, self.name, **whisper_config)
        downloader = YoutubeDownloader(download_path)
        if not overwrite:
            archive_file = os.path.join(download_path, "video_record.txt")
            downloader.config["download_archive"] = archive_file
            self._fill_archive(archive_file)
        downloader.download(url, whisperPP)
        self._concatenate_datasets(whisperPP.get_data())

    def from_files(self, input: str, overwrite: bool = False, **whisper_config):
        """Build rows from local files and merge them into the dataset.

        If ``whisper_config`` has a non-``None`` ``"mode"``, the method of
        that name on a ``WhisperInterpreter`` is called with ``input`` and its
        result (a single record or a list of records) becomes the new rows.
        Otherwise ``input`` is read as JSON: every ``*.json`` file inside it
        when it is a directory, or the path/pattern itself when it is not.

        Args:
            input: File path, glob pattern, or directory.
            overwrite: Accepted for signature parity with ``from_url``; not
                used here.
            **whisper_config: ``"mode"`` and ``"model_size"`` are consumed
                here; the rest is forwarded to the interpreter method.

        Raises:
            KeyError: If ``"mode"`` is set but ``"model_size"`` is missing.
        """
        if whisper_config.get("mode") is not None:
            interpreter = WhisperInterpreter(whisper_config.pop("model_size"))
            process = getattr(interpreter, whisper_config.pop("mode"))
            result = process(input, **whisper_config)
            # Dataset.from_list needs a list; wrap a single record.
            records = result if isinstance(result, list) else [result]
            dataset = Dataset.from_list(records)
        else:
            # Glob inside the directory that was actually passed in, instead
            # of a hard-coded "tmp/" folder.
            pattern = os.path.join(input, "*.json") if os.path.isdir(input) else input
            dataset = load_dataset("json", data_files=glob.glob(pattern), split="train")
        self._concatenate_datasets(dataset)

    def _fill_archive(self, archive_file):
        """Write the ids already in the dataset to ``archive_file``.

        One ``youtube <id>`` line is written per row of the ``train`` split.
        The file is overwritten. Nothing is written when the dataset is empty.

        Args:
            archive_file: Path of the download-archive file to (re)create.
        """
        if self.is_empty:
            return
        # The download directory may not exist yet on a fresh machine.
        archive_dir = os.path.dirname(archive_file)
        if archive_dir:
            os.makedirs(archive_dir, exist_ok=True)
        with open(archive_file, "w", encoding="utf-8") as archive:
            archive.writelines(
                f"youtube {video_id}\n" for video_id in self.dataset["train"]["id"]
            )

    def _concatenate_datasets(self, dataset):
        """Merge ``dataset`` into ``self.dataset``, skipping known ids.

        When the current dataset is empty, ``dataset`` replaces it outright.
        Otherwise only rows whose ``id`` is absent from the existing ``train``
        split are appended to that split.

        Args:
            dataset: Rows to merge; must expose an ``id`` column.
        """
        if self.is_empty:
            self.dataset = dataset
            return
        # Set membership keeps the per-row check O(1).
        known_ids = set(self.dataset["train"]["id"])
        new_rows = dataset.filter(lambda row: row["id"] not in known_ids)
        self.dataset["train"] = concatenate_datasets([self.dataset["train"], new_rows])